diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala index 807ead7d3b..1684e627d6 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala @@ -24,14 +24,14 @@ import org.bdgenomics.adam.rdd.{ GenomicDataset, GenomicDatasetConversion } -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - VariantRDD, - GenotypeRDD, - VariantContextRDD + VariantDataset, + GenotypeDataset, + VariantContextDataset } import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, @@ -45,336 +45,336 @@ import org.bdgenomics.adam.sql.{ import org.bdgenomics.formats.avro._ import scala.reflect.runtime.universe._ -trait ToContigDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +trait ToContigDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { val yTag: TypeTag[NucleotideContigFragmentProduct] = typeTag[NucleotideContigFragmentProduct] } -trait ToCoverageDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Coverage, Coverage, CoverageRDD] { +trait ToCoverageDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Coverage, Coverage, CoverageDataset] { val yTag: TypeTag[Coverage] = typeTag[Coverage] } -trait ToFeatureDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Feature, FeatureProduct, FeatureRDD] { +trait ToFeatureDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Feature, FeatureProduct, FeatureDataset] { val yTag: TypeTag[FeatureProduct] = typeTag[FeatureProduct] } -trait ToFragmentDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Fragment, FragmentProduct, FragmentRDD] { +trait ToFragmentDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Fragment, FragmentProduct, FragmentDataset] { val yTag: TypeTag[FragmentProduct] = typeTag[FragmentProduct] } -trait ToAlignmentRecordDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +trait ToAlignmentRecordDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { val yTag: TypeTag[AlignmentRecordProduct] = typeTag[AlignmentRecordProduct] } -trait 
ToGenotypeDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Genotype, GenotypeProduct, GenotypeRDD] { +trait ToGenotypeDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Genotype, GenotypeProduct, GenotypeDataset] { val yTag: TypeTag[GenotypeProduct] = typeTag[GenotypeProduct] } -trait ToVariantDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Variant, VariantProduct, VariantRDD] { +trait ToVariantDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Variant, VariantProduct, VariantDataset] { val yTag: TypeTag[VariantProduct] = typeTag[VariantProduct] } -trait ToVariantContextDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, VariantContext, VariantContextProduct, VariantContextRDD] { +trait ToVariantContextDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, VariantContext, VariantContextProduct, VariantContextDataset] { val yTag: TypeTag[VariantContextProduct] = typeTag[VariantContextProduct] } -final class ContigsToCoverageDatasetConverter extends ToCoverageDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +final class ContigsToCoverageDatasetConverter extends ToCoverageDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: Dataset[Coverage]): CoverageRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.contigsToCoverageDatasetConversionFn(v1, v2) } } -final class ContigsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +final class ContigsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: Dataset[FeatureProduct]): FeatureRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { ADAMContext.contigsToFeaturesDatasetConversionFn(v1, v2) } } -final class ContigsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +final class ContigsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: Dataset[FragmentProduct]): FragmentRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { ADAMContext.contigsToFragmentsDatasetConversionFn(v1, v2) } } -final class ContigsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +final class ContigsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + def call(v1: 
NucleotideContigFragmentDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { ADAMContext.contigsToAlignmentRecordsDatasetConversionFn(v1, v2) } } -final class ContigsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +final class ContigsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: Dataset[GenotypeProduct]): GenotypeRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { ADAMContext.contigsToGenotypesDatasetConversionFn(v1, v2) } } -final class ContigsToVariantsDatasetConverter extends ToVariantDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +final class ContigsToVariantsDatasetConverter extends ToVariantDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: Dataset[VariantProduct]): VariantRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = { ADAMContext.contigsToVariantsDatasetConversionFn(v1, v2) } } -final class CoverageToContigsDatasetConverter extends ToContigDatasetConversion[Coverage, Coverage, CoverageRDD] { +final class CoverageToContigsDatasetConverter extends ToContigDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: CoverageRDD, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { + def call(v1: CoverageDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { ADAMContext.coverageToContigsDatasetConversionFn(v1, v2) } } -final class CoverageToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Coverage, Coverage, CoverageRDD] { +final class CoverageToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: CoverageRDD, v2: Dataset[FeatureProduct]): FeatureRDD = { + def call(v1: CoverageDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { ADAMContext.coverageToFeaturesDatasetConversionFn(v1, v2) } } -final class CoverageToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Coverage, Coverage, CoverageRDD] { +final class CoverageToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: CoverageRDD, v2: Dataset[FragmentProduct]): FragmentRDD = { + def call(v1: CoverageDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { ADAMContext.coverageToFragmentsDatasetConversionFn(v1, v2) } } -final class CoverageToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Coverage, Coverage, CoverageRDD] { +final class CoverageToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: CoverageRDD, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + def call(v1: CoverageDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { ADAMContext.coverageToAlignmentRecordsDatasetConversionFn(v1, v2) } } -final class CoverageToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Coverage, Coverage, CoverageRDD] { +final class CoverageToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Coverage, 
Coverage, CoverageDataset] { - def call(v1: CoverageRDD, v2: Dataset[GenotypeProduct]): GenotypeRDD = { + def call(v1: CoverageDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { ADAMContext.coverageToGenotypesDatasetConversionFn(v1, v2) } } -final class CoverageToVariantsDatasetConverter extends ToVariantDatasetConversion[Coverage, Coverage, CoverageRDD] { +final class CoverageToVariantsDatasetConverter extends ToVariantDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: CoverageRDD, v2: Dataset[VariantProduct]): VariantRDD = { + def call(v1: CoverageDataset, v2: Dataset[VariantProduct]): VariantDataset = { ADAMContext.coverageToVariantsDatasetConversionFn(v1, v2) } } -final class FeaturesToContigsDatasetConverter extends ToContigDatasetConversion[Feature, FeatureProduct, FeatureRDD] { +final class FeaturesToContigsDatasetConverter extends ToContigDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureRDD, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { + def call(v1: FeatureDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { ADAMContext.featuresToContigsDatasetConversionFn(v1, v2) } } -final class FeaturesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Feature, FeatureProduct, FeatureRDD] { +final class FeaturesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureRDD, v2: Dataset[Coverage]): CoverageRDD = { + def call(v1: FeatureDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.featuresToCoverageDatasetConversionFn(v1, v2) } } -final class FeaturesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Feature, FeatureProduct, FeatureRDD] { +final class FeaturesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureRDD, v2: Dataset[FragmentProduct]): FragmentRDD = { + def call(v1: FeatureDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { ADAMContext.featuresToFragmentsDatasetConversionFn(v1, v2) } } -final class FeaturesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Feature, FeatureProduct, FeatureRDD] { +final class FeaturesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureRDD, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + def call(v1: FeatureDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { ADAMContext.featuresToAlignmentRecordsDatasetConversionFn(v1, v2) } } -final class FeaturesToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Feature, FeatureProduct, FeatureRDD] { +final class FeaturesToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureRDD, v2: Dataset[GenotypeProduct]): GenotypeRDD = { + def call(v1: FeatureDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { ADAMContext.featuresToGenotypesDatasetConversionFn(v1, v2) } } -final class FeaturesToVariantsDatasetConverter extends ToVariantDatasetConversion[Feature, FeatureProduct, FeatureRDD] { +final class FeaturesToVariantsDatasetConverter extends ToVariantDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureRDD, v2: Dataset[VariantProduct]): VariantRDD = { + def call(v1: FeatureDataset, v2: Dataset[VariantProduct]): 
VariantDataset = { ADAMContext.featuresToVariantsDatasetConversionFn(v1, v2) } } -final class FragmentsToContigsDatasetConverter extends ToContigDatasetConversion[Fragment, FragmentProduct, FragmentRDD] { +final class FragmentsToContigsDatasetConverter extends ToContigDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentRDD, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { + def call(v1: FragmentDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { ADAMContext.fragmentsToContigsDatasetConversionFn(v1, v2) } } -final class FragmentsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Fragment, FragmentProduct, FragmentRDD] { +final class FragmentsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentRDD, v2: Dataset[Coverage]): CoverageRDD = { + def call(v1: FragmentDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.fragmentsToCoverageDatasetConversionFn(v1, v2) } } -final class FragmentsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Fragment, FragmentProduct, FragmentRDD] { +final class FragmentsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentRDD, v2: Dataset[FeatureProduct]): FeatureRDD = { + def call(v1: FragmentDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { ADAMContext.fragmentsToFeaturesDatasetConversionFn(v1, v2) } } -final class FragmentsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Fragment, FragmentProduct, FragmentRDD] { +final class FragmentsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentRDD, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + def call(v1: FragmentDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { ADAMContext.fragmentsToAlignmentRecordsDatasetConversionFn(v1, v2) } } -final class FragmentsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Fragment, FragmentProduct, FragmentRDD] { +final class FragmentsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentRDD, v2: Dataset[GenotypeProduct]): GenotypeRDD = { + def call(v1: FragmentDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { ADAMContext.fragmentsToGenotypesDatasetConversionFn(v1, v2) } } -final class FragmentsToVariantsDatasetConverter extends ToVariantDatasetConversion[Fragment, FragmentProduct, FragmentRDD] { +final class FragmentsToVariantsDatasetConverter extends ToVariantDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentRDD, v2: Dataset[VariantProduct]): VariantRDD = { + def call(v1: FragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = { ADAMContext.fragmentsToVariantsDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToContigsDatasetConverter extends ToContigDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +final class AlignmentRecordsToContigsDatasetConverter extends ToContigDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { + def call(v1: AlignmentRecordDataset, v2: 
Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { ADAMContext.alignmentRecordsToContigsDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToCoverageDatasetConverter extends ToCoverageDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +final class AlignmentRecordsToCoverageDatasetConverter extends ToCoverageDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: Dataset[Coverage]): CoverageRDD = { + def call(v1: AlignmentRecordDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.alignmentRecordsToCoverageDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +final class AlignmentRecordsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: Dataset[FeatureProduct]): FeatureRDD = { + def call(v1: AlignmentRecordDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { ADAMContext.alignmentRecordsToFeaturesDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +final class AlignmentRecordsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: Dataset[FragmentProduct]): FragmentRDD = { + def call(v1: AlignmentRecordDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { ADAMContext.alignmentRecordsToFragmentsDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +final class AlignmentRecordsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: Dataset[GenotypeProduct]): GenotypeRDD = { + def call(v1: AlignmentRecordDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { ADAMContext.alignmentRecordsToGenotypesDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToVariantsDatasetConverter extends ToVariantDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +final class AlignmentRecordsToVariantsDatasetConverter extends ToVariantDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: Dataset[VariantProduct]): VariantRDD = { + def call(v1: AlignmentRecordDataset, v2: Dataset[VariantProduct]): VariantDataset = { ADAMContext.alignmentRecordsToVariantsDatasetConversionFn(v1, v2) } } -final class GenotypesToContigsDatasetConverter extends ToContigDatasetConversion[Genotype, GenotypeProduct, GenotypeRDD] { +final class GenotypesToContigsDatasetConverter extends ToContigDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { - def call(v1: GenotypeRDD, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { + def call(v1: GenotypeDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { ADAMContext.genotypesToContigsDatasetConversionFn(v1, v2) } } -final class GenotypesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Genotype, 
GenotypeProduct, GenotypeRDD] { +final class GenotypesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { - def call(v1: GenotypeRDD, v2: Dataset[Coverage]): CoverageRDD = { + def call(v1: GenotypeDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.genotypesToCoverageDatasetConversionFn(v1, v2) } } -final class GenotypesToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Genotype, GenotypeProduct, GenotypeRDD] { +final class GenotypesToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { - def call(v1: GenotypeRDD, v2: Dataset[FeatureProduct]): FeatureRDD = { + def call(v1: GenotypeDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { ADAMContext.genotypesToFeaturesDatasetConversionFn(v1, v2) } } -final class GenotypesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Genotype, GenotypeProduct, GenotypeRDD] { +final class GenotypesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { - def call(v1: GenotypeRDD, v2: Dataset[FragmentProduct]): FragmentRDD = { + def call(v1: GenotypeDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { ADAMContext.genotypesToFragmentsDatasetConversionFn(v1, v2) } } -final class GenotypesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Genotype, GenotypeProduct, GenotypeRDD] { +final class GenotypesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { - def call(v1: GenotypeRDD, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + def call(v1: GenotypeDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { ADAMContext.genotypesToAlignmentRecordsDatasetConversionFn(v1, v2) } } -final class GenotypesToVariantsDatasetConverter extends ToVariantDatasetConversion[Genotype, GenotypeProduct, GenotypeRDD] { +final class GenotypesToVariantsDatasetConverter extends ToVariantDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { - def call(v1: GenotypeRDD, v2: Dataset[VariantProduct]): VariantRDD = { + def call(v1: GenotypeDataset, v2: Dataset[VariantProduct]): VariantDataset = { ADAMContext.genotypesToVariantsDatasetConversionFn(v1, v2) } } -final class VariantsToContigsDatasetConverter extends ToContigDatasetConversion[Variant, VariantProduct, VariantRDD] { +final class VariantsToContigsDatasetConverter extends ToContigDatasetConversion[Variant, VariantProduct, VariantDataset] { - def call(v1: VariantRDD, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { + def call(v1: VariantDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { ADAMContext.variantsToContigsDatasetConversionFn(v1, v2) } } -final class VariantsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Variant, VariantProduct, VariantRDD] { +final class VariantsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Variant, VariantProduct, VariantDataset] { - def call(v1: VariantRDD, v2: Dataset[Coverage]): CoverageRDD = { + def call(v1: VariantDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.variantsToCoverageDatasetConversionFn(v1, v2) } } -final class VariantsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Variant, VariantProduct, VariantRDD] { +final class VariantsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Variant, VariantProduct, 
VariantDataset] { - def call(v1: VariantRDD, v2: Dataset[FeatureProduct]): FeatureRDD = { + def call(v1: VariantDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { ADAMContext.variantsToFeaturesDatasetConversionFn(v1, v2) } } -final class VariantsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Variant, VariantProduct, VariantRDD] { +final class VariantsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Variant, VariantProduct, VariantDataset] { - def call(v1: VariantRDD, v2: Dataset[FragmentProduct]): FragmentRDD = { + def call(v1: VariantDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { ADAMContext.variantsToFragmentsDatasetConversionFn(v1, v2) } } -final class VariantsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Variant, VariantProduct, VariantRDD] { +final class VariantsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Variant, VariantProduct, VariantDataset] { - def call(v1: VariantRDD, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + def call(v1: VariantDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { ADAMContext.variantsToAlignmentRecordsDatasetConversionFn(v1, v2) } } -final class VariantsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Variant, VariantProduct, VariantRDD] { +final class VariantsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Variant, VariantProduct, VariantDataset] { - def call(v1: VariantRDD, v2: Dataset[GenotypeProduct]): GenotypeRDD = { + def call(v1: VariantDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { ADAMContext.variantsToGenotypesDatasetConversionFn(v1, v2) } } diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala index d35dd53842..cb10c9460a 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala @@ -24,461 +24,461 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - VariantRDD, - GenotypeRDD, - VariantContextRDD + VariantDataset, + GenotypeDataset, + VariantContextDataset } import org.bdgenomics.formats.avro._ -final class ContigsToContigsConverter extends Function2[NucleotideContigFragmentRDD, RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class ContigsToContigsConverter extends Function2[NucleotideContigFragmentDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.contigsToContigsConversionFn(v1, v2) } } -final class ContigsToCoverageConverter extends 
Function2[NucleotideContigFragmentRDD, RDD[Coverage], CoverageRDD] { +final class ContigsToCoverageConverter extends Function2[NucleotideContigFragmentDataset, RDD[Coverage], CoverageDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.contigsToCoverageConversionFn(v1, v2) } } -final class ContigsToFeaturesConverter extends Function2[NucleotideContigFragmentRDD, RDD[Feature], FeatureRDD] { +final class ContigsToFeaturesConverter extends Function2[NucleotideContigFragmentDataset, RDD[Feature], FeatureDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.contigsToFeaturesConversionFn(v1, v2) } } -final class ContigsToFragmentsConverter extends Function2[NucleotideContigFragmentRDD, RDD[Fragment], FragmentRDD] { +final class ContigsToFragmentsConverter extends Function2[NucleotideContigFragmentDataset, RDD[Fragment], FragmentDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.contigsToFragmentsConversionFn(v1, v2) } } -final class ContigsToAlignmentRecordsConverter extends Function2[NucleotideContigFragmentRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class ContigsToAlignmentRecordsConverter extends Function2[NucleotideContigFragmentDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.contigsToAlignmentRecordsConversionFn(v1, v2) } } -final class ContigsToGenotypesConverter extends Function2[NucleotideContigFragmentRDD, RDD[Genotype], GenotypeRDD] { +final class ContigsToGenotypesConverter extends Function2[NucleotideContigFragmentDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.contigsToGenotypesConversionFn(v1, v2) } } -final class ContigsToVariantsConverter extends Function2[NucleotideContigFragmentRDD, RDD[Variant], VariantRDD] { +final class ContigsToVariantsConverter extends Function2[NucleotideContigFragmentDataset, RDD[Variant], VariantDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.contigsToVariantsConversionFn(v1, v2) } } -final class ContigsToVariantContextsConverter extends Function2[NucleotideContigFragmentRDD, RDD[VariantContext], VariantContextRDD] { +final class ContigsToVariantContextsConverter extends Function2[NucleotideContigFragmentDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: NucleotideContigFragmentRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: NucleotideContigFragmentDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.contigsToVariantContextConversionFn(v1, v2) } } -final class CoverageToContigsConverter extends Function2[CoverageRDD, RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class CoverageToContigsConverter extends Function2[CoverageDataset, 
RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: CoverageRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: CoverageDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.coverageToContigsConversionFn(v1, v2) } } -final class CoverageToCoverageConverter extends Function2[CoverageRDD, RDD[Coverage], CoverageRDD] { +final class CoverageToCoverageConverter extends Function2[CoverageDataset, RDD[Coverage], CoverageDataset] { - def call(v1: CoverageRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: CoverageDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.coverageToCoverageConversionFn(v1, v2) } } -final class CoverageToFeaturesConverter extends Function2[CoverageRDD, RDD[Feature], FeatureRDD] { +final class CoverageToFeaturesConverter extends Function2[CoverageDataset, RDD[Feature], FeatureDataset] { - def call(v1: CoverageRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: CoverageDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.coverageToFeaturesConversionFn(v1, v2) } } -final class CoverageToFragmentsConverter extends Function2[CoverageRDD, RDD[Fragment], FragmentRDD] { +final class CoverageToFragmentsConverter extends Function2[CoverageDataset, RDD[Fragment], FragmentDataset] { - def call(v1: CoverageRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: CoverageDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.coverageToFragmentsConversionFn(v1, v2) } } -final class CoverageToAlignmentRecordsConverter extends Function2[CoverageRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class CoverageToAlignmentRecordsConverter extends Function2[CoverageDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: CoverageRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: CoverageDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.coverageToAlignmentRecordsConversionFn(v1, v2) } } -final class CoverageToGenotypesConverter extends Function2[CoverageRDD, RDD[Genotype], GenotypeRDD] { +final class CoverageToGenotypesConverter extends Function2[CoverageDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: CoverageRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: CoverageDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.coverageToGenotypesConversionFn(v1, v2) } } -final class CoverageToVariantsConverter extends Function2[CoverageRDD, RDD[Variant], VariantRDD] { +final class CoverageToVariantsConverter extends Function2[CoverageDataset, RDD[Variant], VariantDataset] { - def call(v1: CoverageRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: CoverageDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.coverageToVariantsConversionFn(v1, v2) } } -final class CoverageToVariantContextConverter extends Function2[CoverageRDD, RDD[VariantContext], VariantContextRDD] { +final class CoverageToVariantContextConverter extends Function2[CoverageDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: CoverageRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: CoverageDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.coverageToVariantContextConversionFn(v1, v2) } } -final class FeaturesToContigsConverter extends Function2[FeatureRDD, RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class FeaturesToContigsConverter extends Function2[FeatureDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def 
call(v1: FeatureRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: FeatureDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.featuresToContigsConversionFn(v1, v2) } } -final class FeaturesToCoverageConverter extends Function2[FeatureRDD, RDD[Coverage], CoverageRDD] { +final class FeaturesToCoverageConverter extends Function2[FeatureDataset, RDD[Coverage], CoverageDataset] { - def call(v1: FeatureRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: FeatureDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.featuresToCoverageConversionFn(v1, v2) } } -final class FeaturesToFeatureConverter extends Function2[FeatureRDD, RDD[Feature], FeatureRDD] { +final class FeaturesToFeatureConverter extends Function2[FeatureDataset, RDD[Feature], FeatureDataset] { - def call(v1: FeatureRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: FeatureDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.featuresToFeaturesConversionFn(v1, v2) } } -final class FeaturesToFragmentsConverter extends Function2[FeatureRDD, RDD[Fragment], FragmentRDD] { +final class FeaturesToFragmentsConverter extends Function2[FeatureDataset, RDD[Fragment], FragmentDataset] { - def call(v1: FeatureRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: FeatureDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.featuresToFragmentsConversionFn(v1, v2) } } -final class FeaturesToAlignmentRecordsConverter extends Function2[FeatureRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class FeaturesToAlignmentRecordsConverter extends Function2[FeatureDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: FeatureRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: FeatureDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.featuresToAlignmentRecordsConversionFn(v1, v2) } } -final class FeaturesToGenotypesConverter extends Function2[FeatureRDD, RDD[Genotype], GenotypeRDD] { +final class FeaturesToGenotypesConverter extends Function2[FeatureDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: FeatureRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: FeatureDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.featuresToGenotypesConversionFn(v1, v2) } } -final class FeaturesToVariantsConverter extends Function2[FeatureRDD, RDD[Variant], VariantRDD] { +final class FeaturesToVariantsConverter extends Function2[FeatureDataset, RDD[Variant], VariantDataset] { - def call(v1: FeatureRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: FeatureDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.featuresToVariantsConversionFn(v1, v2) } } -final class FeaturesToVariantContextConverter extends Function2[FeatureRDD, RDD[VariantContext], VariantContextRDD] { +final class FeaturesToVariantContextConverter extends Function2[FeatureDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: FeatureRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: FeatureDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.featuresToVariantContextConversionFn(v1, v2) } } -final class FragmentsToContigsConverter extends Function2[FragmentRDD, RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class FragmentsToContigsConverter extends Function2[FragmentDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: FragmentRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def 
call(v1: FragmentDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.fragmentsToContigsConversionFn(v1, v2) } } -final class FragmentsToCoverageConverter extends Function2[FragmentRDD, RDD[Coverage], CoverageRDD] { +final class FragmentsToCoverageConverter extends Function2[FragmentDataset, RDD[Coverage], CoverageDataset] { - def call(v1: FragmentRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: FragmentDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.fragmentsToCoverageConversionFn(v1, v2) } } -final class FragmentsToFeaturesConverter extends Function2[FragmentRDD, RDD[Feature], FeatureRDD] { +final class FragmentsToFeaturesConverter extends Function2[FragmentDataset, RDD[Feature], FeatureDataset] { - def call(v1: FragmentRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: FragmentDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.fragmentsToFeaturesConversionFn(v1, v2) } } -final class FragmentsToFragmentsConverter extends Function2[FragmentRDD, RDD[Fragment], FragmentRDD] { +final class FragmentsToFragmentsConverter extends Function2[FragmentDataset, RDD[Fragment], FragmentDataset] { - def call(v1: FragmentRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: FragmentDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.fragmentsToFragmentsConversionFn(v1, v2) } } -final class FragmentsToAlignmentRecordsConverter extends Function2[FragmentRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class FragmentsToAlignmentRecordsConverter extends Function2[FragmentDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: FragmentRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: FragmentDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.fragmentsToAlignmentRecordsConversionFn(v1, v2) } } -final class FragmentsToGenotypesConverter extends Function2[FragmentRDD, RDD[Genotype], GenotypeRDD] { +final class FragmentsToGenotypesConverter extends Function2[FragmentDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: FragmentRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: FragmentDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.fragmentsToGenotypesConversionFn(v1, v2) } } -final class FragmentsToVariantsConverter extends Function2[FragmentRDD, RDD[Variant], VariantRDD] { +final class FragmentsToVariantsConverter extends Function2[FragmentDataset, RDD[Variant], VariantDataset] { - def call(v1: FragmentRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: FragmentDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.fragmentsToVariantsConversionFn(v1, v2) } } -final class FragmentsToVariantContextConverter extends Function2[FragmentRDD, RDD[VariantContext], VariantContextRDD] { +final class FragmentsToVariantContextConverter extends Function2[FragmentDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: FragmentRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: FragmentDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.fragmentsToVariantContextConversionFn(v1, v2) } } -final class AlignmentRecordsToContigsConverter extends Function2[AlignmentRecordRDD, RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class AlignmentRecordsToContigsConverter extends Function2[AlignmentRecordDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: 
AlignmentRecordDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.alignmentRecordsToContigsConversionFn(v1, v2) } } -final class AlignmentRecordsToCoverageConverter extends Function2[AlignmentRecordRDD, RDD[Coverage], CoverageRDD] { +final class AlignmentRecordsToCoverageConverter extends Function2[AlignmentRecordDataset, RDD[Coverage], CoverageDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.alignmentRecordsToCoverageConversionFn(v1, v2) } } -final class AlignmentRecordsToFeaturesConverter extends Function2[AlignmentRecordRDD, RDD[Feature], FeatureRDD] { +final class AlignmentRecordsToFeaturesConverter extends Function2[AlignmentRecordDataset, RDD[Feature], FeatureDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.alignmentRecordsToFeaturesConversionFn(v1, v2) } } -final class AlignmentRecordsToFragmentsConverter extends Function2[AlignmentRecordRDD, RDD[Fragment], FragmentRDD] { +final class AlignmentRecordsToFragmentsConverter extends Function2[AlignmentRecordDataset, RDD[Fragment], FragmentDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.alignmentRecordsToFragmentsConversionFn(v1, v2) } } -final class AlignmentRecordsToAlignmentRecordsConverter extends Function2[AlignmentRecordRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class AlignmentRecordsToAlignmentRecordsConverter extends Function2[AlignmentRecordDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.alignmentRecordsToAlignmentRecordsConversionFn(v1, v2) } } -final class AlignmentRecordsToGenotypesConverter extends Function2[AlignmentRecordRDD, RDD[Genotype], GenotypeRDD] { +final class AlignmentRecordsToGenotypesConverter extends Function2[AlignmentRecordDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.alignmentRecordsToGenotypesConversionFn(v1, v2) } } -final class AlignmentRecordsToVariantsConverter extends Function2[AlignmentRecordRDD, RDD[Variant], VariantRDD] { +final class AlignmentRecordsToVariantsConverter extends Function2[AlignmentRecordDataset, RDD[Variant], VariantDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.alignmentRecordsToVariantsConversionFn(v1, v2) } } -final class AlignmentRecordsToVariantContextConverter extends Function2[AlignmentRecordRDD, RDD[VariantContext], VariantContextRDD] { +final class AlignmentRecordsToVariantContextConverter extends Function2[AlignmentRecordDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.alignmentRecordsToVariantContextConversionFn(v1, v2) } } -final class GenotypesToContigsConverter extends Function2[GenotypeRDD, 
RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class GenotypesToContigsConverter extends Function2[GenotypeDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: GenotypeRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: GenotypeDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.genotypesToContigsConversionFn(v1, v2) } } -final class GenotypesToCoverageConverter extends Function2[GenotypeRDD, RDD[Coverage], CoverageRDD] { +final class GenotypesToCoverageConverter extends Function2[GenotypeDataset, RDD[Coverage], CoverageDataset] { - def call(v1: GenotypeRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: GenotypeDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.genotypesToCoverageConversionFn(v1, v2) } } -final class GenotypesToFeaturesConverter extends Function2[GenotypeRDD, RDD[Feature], FeatureRDD] { +final class GenotypesToFeaturesConverter extends Function2[GenotypeDataset, RDD[Feature], FeatureDataset] { - def call(v1: GenotypeRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: GenotypeDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.genotypesToFeaturesConversionFn(v1, v2) } } -final class GenotypesToFragmentsConverter extends Function2[GenotypeRDD, RDD[Fragment], FragmentRDD] { +final class GenotypesToFragmentsConverter extends Function2[GenotypeDataset, RDD[Fragment], FragmentDataset] { - def call(v1: GenotypeRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: GenotypeDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.genotypesToFragmentsConversionFn(v1, v2) } } -final class GenotypesToAlignmentRecordsConverter extends Function2[GenotypeRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class GenotypesToAlignmentRecordsConverter extends Function2[GenotypeDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: GenotypeRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: GenotypeDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.genotypesToAlignmentRecordsConversionFn(v1, v2) } } -final class GenotypesToGenotypesConverter extends Function2[GenotypeRDD, RDD[Genotype], GenotypeRDD] { +final class GenotypesToGenotypesConverter extends Function2[GenotypeDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: GenotypeRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: GenotypeDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.genotypesToGenotypesConversionFn(v1, v2) } } -final class GenotypesToVariantsConverter extends Function2[GenotypeRDD, RDD[Variant], VariantRDD] { +final class GenotypesToVariantsConverter extends Function2[GenotypeDataset, RDD[Variant], VariantDataset] { - def call(v1: GenotypeRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: GenotypeDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.genotypesToVariantsConversionFn(v1, v2) } } -final class GenotypesToVariantContextConverter extends Function2[GenotypeRDD, RDD[VariantContext], VariantContextRDD] { +final class GenotypesToVariantContextConverter extends Function2[GenotypeDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: GenotypeRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: GenotypeDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.genotypesToVariantContextConversionFn(v1, v2) } } -final class VariantsToContigsConverter extends Function2[VariantRDD, RDD[NucleotideContigFragment], 
NucleotideContigFragmentRDD] { +final class VariantsToContigsConverter extends Function2[VariantDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: VariantRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: VariantDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.variantsToContigsConversionFn(v1, v2) } } -final class VariantsToCoverageConverter extends Function2[VariantRDD, RDD[Coverage], CoverageRDD] { +final class VariantsToCoverageConverter extends Function2[VariantDataset, RDD[Coverage], CoverageDataset] { - def call(v1: VariantRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: VariantDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.variantsToCoverageConversionFn(v1, v2) } } -final class VariantsToFeaturesConverter extends Function2[VariantRDD, RDD[Feature], FeatureRDD] { +final class VariantsToFeaturesConverter extends Function2[VariantDataset, RDD[Feature], FeatureDataset] { - def call(v1: VariantRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: VariantDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.variantsToFeaturesConversionFn(v1, v2) } } -final class VariantsToFragmentsConverter extends Function2[VariantRDD, RDD[Fragment], FragmentRDD] { +final class VariantsToFragmentsConverter extends Function2[VariantDataset, RDD[Fragment], FragmentDataset] { - def call(v1: VariantRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: VariantDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.variantsToFragmentsConversionFn(v1, v2) } } -final class VariantsToAlignmentRecordsConverter extends Function2[VariantRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class VariantsToAlignmentRecordsConverter extends Function2[VariantDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: VariantRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: VariantDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.variantsToAlignmentRecordsConversionFn(v1, v2) } } -final class VariantsToGenotypesConverter extends Function2[VariantRDD, RDD[Genotype], GenotypeRDD] { +final class VariantsToGenotypesConverter extends Function2[VariantDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: VariantRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: VariantDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.variantsToGenotypesConversionFn(v1, v2) } } -final class VariantsToVariantsConverter extends Function2[VariantRDD, RDD[Variant], VariantRDD] { +final class VariantsToVariantsConverter extends Function2[VariantDataset, RDD[Variant], VariantDataset] { - def call(v1: VariantRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: VariantDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.variantsToVariantsConversionFn(v1, v2) } } -final class VariantsToVariantContextConverter extends Function2[VariantRDD, RDD[VariantContext], VariantContextRDD] { +final class VariantsToVariantContextConverter extends Function2[VariantDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: VariantRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: VariantDataset, v2: RDD[VariantContext]): VariantContextDataset = { ADAMContext.variantsToVariantContextConversionFn(v1, v2) } } -final class VariantContextsToContigsConverter extends Function2[VariantContextRDD, RDD[NucleotideContigFragment], NucleotideContigFragmentRDD] { +final class VariantContextsToContigsConverter extends 
Function2[VariantContextDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - def call(v1: VariantContextRDD, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def call(v1: VariantContextDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { ADAMContext.variantContextsToContigsConversionFn(v1, v2) } } -final class VariantContextsToCoverageConverter extends Function2[VariantContextRDD, RDD[Coverage], CoverageRDD] { +final class VariantContextsToCoverageConverter extends Function2[VariantContextDataset, RDD[Coverage], CoverageDataset] { - def call(v1: VariantContextRDD, v2: RDD[Coverage]): CoverageRDD = { + def call(v1: VariantContextDataset, v2: RDD[Coverage]): CoverageDataset = { ADAMContext.variantContextsToCoverageConversionFn(v1, v2) } } -final class VariantContextsToFeaturesConverter extends Function2[VariantContextRDD, RDD[Feature], FeatureRDD] { +final class VariantContextsToFeaturesConverter extends Function2[VariantContextDataset, RDD[Feature], FeatureDataset] { - def call(v1: VariantContextRDD, v2: RDD[Feature]): FeatureRDD = { + def call(v1: VariantContextDataset, v2: RDD[Feature]): FeatureDataset = { ADAMContext.variantContextsToFeaturesConversionFn(v1, v2) } } -final class VariantContextsToFragmentsConverter extends Function2[VariantContextRDD, RDD[Fragment], FragmentRDD] { +final class VariantContextsToFragmentsConverter extends Function2[VariantContextDataset, RDD[Fragment], FragmentDataset] { - def call(v1: VariantContextRDD, v2: RDD[Fragment]): FragmentRDD = { + def call(v1: VariantContextDataset, v2: RDD[Fragment]): FragmentDataset = { ADAMContext.variantContextsToFragmentsConversionFn(v1, v2) } } -final class VariantContextsToAlignmentRecordsConverter extends Function2[VariantContextRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +final class VariantContextsToAlignmentRecordsConverter extends Function2[VariantContextDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: VariantContextRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: VariantContextDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.variantContextsToAlignmentRecordsConversionFn(v1, v2) } } -final class VariantContextsToGenotypesConverter extends Function2[VariantContextRDD, RDD[Genotype], GenotypeRDD] { +final class VariantContextsToGenotypesConverter extends Function2[VariantContextDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: VariantContextRDD, v2: RDD[Genotype]): GenotypeRDD = { + def call(v1: VariantContextDataset, v2: RDD[Genotype]): GenotypeDataset = { ADAMContext.variantContextsToGenotypesConversionFn(v1, v2) } } -final class VariantContextsToVariantsConverter extends Function2[VariantContextRDD, RDD[Variant], VariantRDD] { +final class VariantContextsToVariantsConverter extends Function2[VariantContextDataset, RDD[Variant], VariantDataset] { - def call(v1: VariantContextRDD, v2: RDD[Variant]): VariantRDD = { + def call(v1: VariantContextDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.variantContextsToVariantsConversionFn(v1, v2) } } -final class VariantContextsToVariantContextConverter extends Function2[VariantContextRDD, RDD[VariantContext], VariantContextRDD] { +final class VariantContextsToVariantContextConverter extends Function2[VariantContextDataset, RDD[VariantContext], VariantContextDataset] { - def call(v1: VariantContextRDD, v2: RDD[VariantContext]): VariantContextRDD = { + def call(v1: VariantContextDataset, v2: RDD[VariantContext]): 
VariantContextDataset = { ADAMContext.variantContextsToVariantContextsConversionFn(v1, v2) } } diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala index 59a9ad4ecf..4135a40e6a 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala @@ -21,13 +21,13 @@ import htsjdk.samtools.ValidationStringency import org.apache.spark.api.java.JavaSparkContext import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.rdd.ADAMContext -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD + GenotypeDataset, + VariantDataset } import org.bdgenomics.adam.util.ReferenceFile import scala.collection.JavaConversions._ @@ -51,7 +51,7 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { def getSparkContext: JavaSparkContext = new JavaSparkContext(ac.sc) /** - * Load alignment records into an AlignmentRecordRDD (java-friendly method). + * Load alignment records into an AlignmentRecordDataset (java-friendly method). * * Loads path names ending in: * * .bam/.cram/.sam as BAM/CRAM/SAM format, @@ -70,16 +70,16 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load alignment records from. * Globs/directories are supported, although file extension must be present * for BAM/CRAM/SAM, FASTA, and FASTQ formats. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. */ - def loadAlignments(pathName: java.lang.String): AlignmentRecordRDD = { + def loadAlignments(pathName: java.lang.String): AlignmentRecordDataset = { ac.loadAlignments(pathName) } /** - * Load alignment records into an AlignmentRecordRDD (java-friendly method). + * Load alignment records into an AlignmentRecordDataset (java-friendly method). * * Loads path names ending in: * * .bam/.cram/.sam as BAM/CRAM/SAM format, @@ -100,12 +100,12 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * for BAM/CRAM/SAM, FASTA, and FASTQ formats. * @param stringency The validation stringency to use when validating * BAM/CRAM/SAM or FASTQ formats. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. 
*/ def loadAlignments(pathName: java.lang.String, - stringency: ValidationStringency): AlignmentRecordRDD = { + stringency: ValidationStringency): AlignmentRecordDataset = { ac.loadAlignments(pathName, stringency = stringency) } @@ -119,20 +119,20 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param viewRegions Iterable of ReferenceRegion we are filtering on. * @param stringency The validation stringency to use when validating the * BAM/CRAM/SAM format header. Defaults to ValidationStringency.STRICT. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. */ def loadIndexedBam( pathName: String, viewRegions: java.util.List[ReferenceRegion], - stringency: ValidationStringency): AlignmentRecordRDD = { + stringency: ValidationStringency): AlignmentRecordDataset = { ac.loadIndexedBam(pathName, viewRegions.toIterable, stringency = stringency) } /** - * Load nucleotide contig fragments into a NucleotideContigFragmentRDD (java-friendly method). + * Load nucleotide contig fragments into a NucleotideContigFragmentDataset (java-friendly method). * * If the path name has a .fa/.fasta extension, load as FASTA format. * Else, fall back to Parquet + Avro. @@ -145,14 +145,14 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load nucleotide contig fragments from. * Globs/directories are supported, although file extension must be present * for FASTA format. - * @return Returns a NucleotideContigFragmentRDD. + * @return Returns a NucleotideContigFragmentDataset. */ - def loadContigFragments(pathName: java.lang.String): NucleotideContigFragmentRDD = { + def loadContigFragments(pathName: java.lang.String): NucleotideContigFragmentDataset = { ac.loadContigFragments(pathName) } /** - * Load fragments into a FragmentRDD (java-friendly method). + * Load fragments into a FragmentDataset (java-friendly method). * * Loads path names ending in: * * .bam/.cram/.sam as BAM/CRAM/SAM format and @@ -168,14 +168,14 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load fragments from. * Globs/directories are supported, although file extension must be present * for BAM/CRAM/SAM and FASTQ formats. - * @return Returns a FragmentRDD. + * @return Returns a FragmentDataset. */ - def loadFragments(pathName: java.lang.String): FragmentRDD = { + def loadFragments(pathName: java.lang.String): FragmentDataset = { ac.loadFragments(pathName) } /** - * Load fragments into a FragmentRDD (java-friendly method). + * Load fragments into a FragmentDataset (java-friendly method). * * Loads path names ending in: * * .bam/.cram/.sam as BAM/CRAM/SAM format and @@ -192,15 +192,15 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * Globs/directories are supported, although file extension must be present * for BAM/CRAM/SAM and FASTQ formats. * @param stringency The validation stringency to use when validating BAM/CRAM/SAM or FASTQ formats. - * @return Returns a FragmentRDD. + * @return Returns a FragmentDataset. 
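A minimal usage sketch for the java-friendly loaders above. The paths and the region are hypothetical, a live SparkContext named sc is assumed, and wrapping sc in an ADAMContext is shown here as a plain constructor call:

```scala
import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkContext
import org.bdgenomics.adam.api.java.JavaADAMContext
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext
import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset

// Hypothetical paths and region; sc is a live SparkContext.
def loadReads(sc: SparkContext): (AlignmentRecordDataset, AlignmentRecordDataset) = {
  val jac = new JavaADAMContext(new ADAMContext(sc))

  // Extension-based dispatch: .bam/.cram/.sam load as BAM/CRAM/SAM.
  val allReads = jac.loadAlignments("alignments.bam", ValidationStringency.LENIENT)

  // Region-restricted load from an indexed BAM.
  val region = ReferenceRegion("chr1", 100000L, 200000L)
  val slice = jac.loadIndexedBam("alignments.bam",
                                 java.util.Arrays.asList(region),
                                 ValidationStringency.STRICT)
  (allReads, slice)
}
```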
*/ def loadFragments(pathName: java.lang.String, - stringency: ValidationStringency): FragmentRDD = { + stringency: ValidationStringency): FragmentDataset = { ac.loadFragments(pathName, stringency = stringency) } /** - * Load features into a FeatureRDD (java-friendly method). + * Load features into a FeatureDataset (java-friendly method). * * Loads path names ending in: * * .bed as BED6/12 format, @@ -220,14 +220,14 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load features from. * Globs/directories are supported, although file extension must be present * for BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ - def loadFeatures(pathName: java.lang.String): FeatureRDD = { + def loadFeatures(pathName: java.lang.String): FeatureDataset = { ac.loadFeatures(pathName) } /** - * Load features into a FeatureRDD (java-friendly method). + * Load features into a FeatureDataset (java-friendly method). * * Loads path names ending in: * * .bed as BED6/12 format, @@ -249,15 +249,15 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * for BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats. * @param stringency The validation stringency to use when validating BED6/12, GFF3, * GTF/GFF2, NarrowPeak, or IntervalList formats. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadFeatures(pathName: java.lang.String, - stringency: ValidationStringency): FeatureRDD = { + stringency: ValidationStringency): FeatureDataset = { ac.loadFeatures(pathName, stringency = stringency) } /** - * Load features into a FeatureRDD and convert to a CoverageRDD (java-friendly method). + * Load features into a FeatureDataset and convert to a CoverageDataset (java-friendly method). * Coverage is stored in the score field of Feature. * * Loads path names ending in: @@ -278,14 +278,14 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load features from. * Globs/directories are supported, although file extension must be present * for BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats. - * @return Returns a FeatureRDD converted to a CoverageRDD. + * @return Returns a FeatureDataset converted to a CoverageDataset. */ - def loadCoverage(pathName: java.lang.String): CoverageRDD = { + def loadCoverage(pathName: java.lang.String): CoverageDataset = { ac.loadCoverage(pathName) } /** - * Load features into a FeatureRDD and convert to a CoverageRDD (java-friendly method). + * Load features into a FeatureDataset and convert to a CoverageDataset (java-friendly method). * Coverage is stored in the score field of Feature. * * Loads path names ending in: @@ -308,16 +308,16 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * for BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats. * @param stringency The validation stringency to use when validating BED6/12, GFF3, * GTF/GFF2, NarrowPeak, or IntervalList formats. - * @return Returns a FeatureRDD converted to a CoverageRDD. + * @return Returns a FeatureDataset converted to a CoverageDataset. */ def loadCoverage(pathName: java.lang.String, - stringency: ValidationStringency): CoverageRDD = { + stringency: ValidationStringency): CoverageDataset = { ac.loadCoverage(pathName, stringency = stringency) } /** - * Load genotypes into a GenotypeRDD (java-friendly method). 
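The feature and coverage loaders above follow the same pattern; a small sketch with hypothetical paths, reusing a JavaADAMContext built as in the previous sketch:

```scala
import org.bdgenomics.adam.api.java.JavaADAMContext
import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }

// jac: a JavaADAMContext as constructed earlier; paths are hypothetical.
def loadFeaturesAndCoverage(jac: JavaADAMContext): (FeatureDataset, CoverageDataset) = {
  // Format is chosen by extension (BED6/12, GFF3, GTF/GFF2, NarrowPeak, IntervalList).
  val features = jac.loadFeatures("peaks.narrowPeak")
  // Same parsers, but per-base coverage is read from the Feature score field.
  val coverage = jac.loadCoverage("coverage.bed")
  (features, coverage)
}
```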
+ * Load genotypes into a GenotypeDataset (java-friendly method). * * If the path name has a .vcf/.vcf.gz/.vcf.bgzf/.vcf.bgz extension, load as VCF format. * Else, fall back to Parquet + Avro. @@ -327,14 +327,14 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load genotypes from. * Globs/directories are supported, although file extension must be present * for VCF format. - * @return Returns a GenotypeRDD. + * @return Returns a GenotypeDataset. */ - def loadGenotypes(pathName: java.lang.String): GenotypeRDD = { + def loadGenotypes(pathName: java.lang.String): GenotypeDataset = { ac.loadGenotypes(pathName) } /** - * Load genotypes into a GenotypeRDD (java-friendly method). + * Load genotypes into a GenotypeDataset (java-friendly method). * * If the path name has a .vcf/.vcf.gz/.vcf.bgzf/.vcf.bgz extension, load as VCF format. * Else, fall back to Parquet + Avro. @@ -345,16 +345,16 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * Globs/directories are supported, although file extension must be present * for VCF format. * @param stringency The validation stringency to use when validating VCF format. - * @return Returns a GenotypeRDD. + * @return Returns a GenotypeDataset. */ def loadGenotypes(pathName: java.lang.String, - stringency: ValidationStringency): GenotypeRDD = { + stringency: ValidationStringency): GenotypeDataset = { ac.loadGenotypes(pathName, stringency = stringency) } /** - * Load variants into a VariantRDD (java-friendly method). + * Load variants into a VariantDataset (java-friendly method). * * If the path name has a .vcf/.vcf.gz/.vcf.bgzf/.vcf.bgz extension, load as VCF format. * Else, fall back to Parquet + Avro. @@ -363,14 +363,14 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * * @param pathName The path name to load variants from. * Globs/directories are supported, although file extension must be present for VCF format. - * @return Returns a VariantRDD. + * @return Returns a VariantDataset. */ - def loadVariants(pathName: java.lang.String): VariantRDD = { + def loadVariants(pathName: java.lang.String): VariantDataset = { ac.loadVariants(pathName) } /** - * Load variants into a VariantRDD (java-friendly method). + * Load variants into a VariantDataset (java-friendly method). * * If the path name has a .vcf/.vcf.gz/.vcf.bgzf/.vcf.bgz extension, load as VCF format. * Else, fall back to Parquet + Avro. @@ -380,10 +380,10 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { * @param pathName The path name to load variants from. * Globs/directories are supported, although file extension must be present for VCF format. * @param stringency The validation stringency to use when validating VCF format. - * @return Returns a VariantRDD. + * @return Returns a VariantDataset. 
*/ def loadVariants(pathName: java.lang.String, - stringency: ValidationStringency): VariantRDD = { + stringency: ValidationStringency): VariantDataset = { ac.loadVariants(pathName, stringency = stringency) } diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java index 1c32b1b461..fe732bb02c 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD; +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of nucleotide * contig fragments to disk and reads it back. */ final class JavaADAMContigConduit { - public static NucleotideContigFragmentRDD conduit(final NucleotideContigFragmentRDD recordRdd, - final ADAMContext ac) throws IOException { + public static NucleotideContigFragmentDataset conduit(final NucleotideContigFragmentDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMCoverageConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMCoverageConduit.java index 2794ccfce0..ee1f31050d 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMCoverageConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMCoverageConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.feature.CoverageRDD; +import org.bdgenomics.adam.rdd.feature.CoverageDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of coverage to * disk and reads it back. */ final class JavaADAMCoverageConduit { - public static CoverageRDD conduit(final CoverageRDD recordRdd, - final ADAMContext ac) throws IOException { + public static CoverageDataset conduit(final CoverageDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFeatureConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFeatureConduit.java index 3527e4e4d5..5e00188d0f 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFeatureConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFeatureConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.feature.FeatureRDD; +import org.bdgenomics.adam.rdd.feature.FeatureDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of features to * disk and reads it back. 
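A matching sketch for the genotype and variant loaders shown above, with a hypothetical VCF path:

```scala
import htsjdk.samtools.ValidationStringency
import org.bdgenomics.adam.api.java.JavaADAMContext
import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset }

// .vcf/.vcf.gz/.vcf.bgzf/.vcf.bgz load as VCF; anything else falls back to Parquet + Avro.
def loadCalls(jac: JavaADAMContext): (GenotypeDataset, VariantDataset) = {
  val genotypes = jac.loadGenotypes("calls.vcf.gz", ValidationStringency.LENIENT)
  val variants = jac.loadVariants("calls.vcf.gz")
  (genotypes, variants)
}
```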
*/ final class JavaADAMFeatureConduit { - public static FeatureRDD conduit(final FeatureRDD recordRdd, - final ADAMContext ac) throws IOException { + public static FeatureDataset conduit(final FeatureDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFragmentConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFragmentConduit.java index 21daae968e..4b6b2cbfc3 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFragmentConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMFragmentConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.fragment.FragmentRDD; +import org.bdgenomics.adam.rdd.fragment.FragmentDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of fragments to * disk and reads it back. */ final class JavaADAMFragmentConduit { - public static FragmentRDD conduit(final FragmentRDD recordRdd, - final ADAMContext ac) throws IOException { + public static FragmentDataset conduit(final FragmentDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMGenotypeConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMGenotypeConduit.java index ebd90f8f26..51f1a59246 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMGenotypeConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMGenotypeConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.variant.GenotypeRDD; +import org.bdgenomics.adam.rdd.variant.GenotypeDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of annotations to * disk and reads it back. */ final class JavaADAMGenotypeConduit { - public static GenotypeRDD conduit(final GenotypeRDD recordRdd, - final ADAMContext ac) throws IOException { + public static GenotypeDataset conduit(final GenotypeDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMReadConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMReadConduit.java index ea33d73170..30baadd591 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMReadConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMReadConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD; +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of reads to * disk and reads it back. 
*/ class JavaADAMReadConduit { - public static AlignmentRecordRDD conduit(final AlignmentRecordRDD recordRdd, - final ADAMContext ac) throws IOException { + public static AlignmentRecordDataset conduit(final AlignmentRecordDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMVariantConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMVariantConduit.java index b1e5eaba10..ae47300fdd 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMVariantConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMVariantConduit.java @@ -21,15 +21,15 @@ import java.nio.file.Files; import java.nio.file.Path; import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.variant.VariantRDD; +import org.bdgenomics.adam.rdd.variant.VariantDataset; /** * A simple test class for the JavaADAMRDD/Context. Writes an RDD of annotations to * disk and reads it back. */ final class JavaADAMVariantConduit { - public static VariantRDD conduit(final VariantRDD recordRdd, - final ADAMContext ac) throws IOException { + public static VariantDataset conduit(final VariantDataset recordRdd, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala index 6f963a4b8b..630508bdda 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkContext import org.bdgenomics.adam.projections.AlignmentRecordField._ import org.bdgenomics.adam.projections.Projection import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.utils.cli._ import org.kohsuke.args4j.{ Argument, Option => Args4jOption } @@ -76,7 +76,7 @@ class Reads2Coverage(protected val args: Reads2CoverageArgs) extends BDGSparkCom "Cannot compute coverage for both negative and positive strands separately") // load reads - val readsRdd: AlignmentRecordRDD = sc.loadAlignments(args.inputPath) + val readsRdd: AlignmentRecordDataset = sc.loadAlignments(args.inputPath) val finalReads = if (args.onlyNegativeStrands && !args.onlyPositiveStrands) { readsRdd.transform(rdd => rdd.filter(_.getReadNegativeStrand)) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformAlignments.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformAlignments.scala index c4f83a4f59..39c4bbadbf 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformAlignments.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformAlignments.scala @@ -29,7 +29,7 @@ import org.bdgenomics.adam.models.{ ReferenceRegion, SnpTable } import org.bdgenomics.adam.projections.{ AlignmentRecordField, Filter } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs -import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, QualityScoreBin } +import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, QualityScoreBin } import org.bdgenomics.adam.rich.RichVariant import org.bdgenomics.formats.avro.ProcessingStep 
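In Reads2Coverage above, the strand filter is a one-line transform over the loaded AlignmentRecordDataset; a minimal sketch, assuming a SparkContext named sc and a hypothetical input path:

```scala
import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset

// Keep only negative-strand reads, mirroring the onlyNegativeStrands branch above.
def negativeStrandReads(sc: SparkContext, inputPath: String): AlignmentRecordDataset = {
  val readsRdd: AlignmentRecordDataset = sc.loadAlignments(inputPath)
  readsRdd.transform(rdd => rdd.filter(_.getReadNegativeStrand))
}
```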
import org.bdgenomics.utils.cli._ @@ -156,7 +156,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * @return If the binQualityScores argument is set, rewrites the quality scores of the * reads into bins. Else, returns the original RDD. */ - private def maybeBin(rdd: AlignmentRecordRDD): AlignmentRecordRDD = { + private def maybeBin(rdd: AlignmentRecordDataset): AlignmentRecordDataset = { Option(args.binQualityScores).fold(rdd)(binDescription => { val bins = QualityScoreBin(binDescription) rdd.binQualityScores(bins) @@ -169,7 +169,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * to the number of partitions requested by the user. Forces a shuffle using * a hash partitioner. */ - private def maybeRepartition(rdd: AlignmentRecordRDD): AlignmentRecordRDD = { + private def maybeRepartition(rdd: AlignmentRecordDataset): AlignmentRecordDataset = { if (args.repartition != -1) { log.info("Repartitioning reads to to '%d' partitions".format(args.repartition)) rdd.transform(_.repartition(args.repartition)) @@ -184,7 +184,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * where reads have been marked as duplicates if they appear to be from * duplicated fragments. Else, returns the input RDD. */ - private def maybeDedupe(rdd: AlignmentRecordRDD): AlignmentRecordRDD = { + private def maybeDedupe(rdd: AlignmentRecordDataset): AlignmentRecordDataset = { if (args.markDuplicates) { log.info("Marking duplicates") rdd.markDuplicates() @@ -205,8 +205,8 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * -realign_indels is not set, we return the input RDD. */ private def maybeRealign(sc: SparkContext, - rdd: AlignmentRecordRDD, - sl: StorageLevel): AlignmentRecordRDD = { + rdd: AlignmentRecordDataset, + sl: StorageLevel): AlignmentRecordDataset = { if (args.locallyRealign) { log.info("Locally realigning indels.") @@ -264,8 +264,8 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * known variation when recalibrating. If BQSR has not been requested, * we return the input RDD. */ - private def maybeRecalibrate(rdd: AlignmentRecordRDD, - sl: StorageLevel): AlignmentRecordRDD = { + private def maybeRecalibrate(rdd: AlignmentRecordDataset, + sl: StorageLevel): AlignmentRecordDataset = { if (args.recalibrateBaseQualities) { log.info("Recalibrating base qualities") @@ -305,7 +305,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * smaller than the current number of partitions, or if the -force_shuffle * flag is set. If -coalesce was not requested, returns the input RDD. */ - private def maybeCoalesce(rdd: AlignmentRecordRDD): AlignmentRecordRDD = { + private def maybeCoalesce(rdd: AlignmentRecordDataset): AlignmentRecordDataset = { if (args.coalesce != -1) { log.info("Coalescing the number of partitions to '%d'".format(args.coalesce)) if (args.coalesce > rdd.rdd.partitions.length || args.forceShuffle) { @@ -326,8 +326,8 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * instead of by contig index. If no sorting was requested, returns * the input RDD. */ - private def maybeSort(rdd: AlignmentRecordRDD, - sl: StorageLevel): AlignmentRecordRDD = { + private def maybeSort(rdd: AlignmentRecordDataset, + sl: StorageLevel): AlignmentRecordDataset = { if (args.sortReads) { // cache the input if requested. 
sort is two stages: @@ -366,8 +366,8 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B * return the input RDD. */ private def maybeMdTag(sc: SparkContext, - rdd: AlignmentRecordRDD, - stringencyOpt: Option[ValidationStringency]): AlignmentRecordRDD = { + rdd: AlignmentRecordDataset, + stringencyOpt: Option[ValidationStringency]): AlignmentRecordDataset = { if (args.mdTagsReferenceFile != null) { log.info(s"Adding MDTags to reads based on reference file ${args.mdTagsReferenceFile}") val referenceFile = sc.loadReferenceFile(args.mdTagsReferenceFile, @@ -381,7 +381,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B } } - def apply(rdd: AlignmentRecordRDD): AlignmentRecordRDD = { + def apply(rdd: AlignmentRecordDataset): AlignmentRecordDataset = { val sc = rdd.rdd.context val sl = StorageLevel.fromString(args.storageLevel) @@ -459,7 +459,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B ) } - val loadedRdd: AlignmentRecordRDD = + val loadedRdd: AlignmentRecordDataset = if (args.forceLoadBam) { if (args.regionPredicate != null) { val loci = ReferenceRegion.fromString(args.regionPredicate) @@ -510,7 +510,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B } } - val aRdd: AlignmentRecordRDD = if (args.disableProcessingStep) { + val aRdd: AlignmentRecordDataset = if (args.disableProcessingStep) { loadedRdd } else { // add program info @@ -559,7 +559,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B }) // make a new aligned read rdd, that merges the two RDDs together - val newRdd = AlignmentRecordRDD(mergedRdd, mergedSd, mergedRgd, mergedPgs) + val newRdd = AlignmentRecordDataset(mergedRdd, mergedSd, mergedRgd, mergedPgs) // run our transformation val outputRdd = this.apply(newRdd) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFragments.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFragments.scala index 50bbf3ee92..2ace13bd2f 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFragments.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFragments.scala @@ -22,7 +22,7 @@ import org.bdgenomics.adam.io.FastqRecordReader import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs import org.bdgenomics.adam.rdd.read.QualityScoreBin -import org.bdgenomics.adam.rdd.fragment.FragmentRDD +import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.utils.cli._ import org.bdgenomics.utils.misc.Logging import org.kohsuke.args4j.{ Argument, Option => Args4jOption } @@ -74,7 +74,7 @@ class TransformFragments(protected val args: TransformFragmentsArgs) extends BDG * @return If the mark duplicates argument is sent, deduplicates the reads. * Else, returns the input reads. */ - def maybeDedupe(reads: FragmentRDD): FragmentRDD = { + def maybeDedupe(reads: FragmentDataset): FragmentDataset = { if (args.markDuplicates) { reads.markDuplicates() } else { @@ -87,7 +87,7 @@ class TransformFragments(protected val args: TransformFragmentsArgs) extends BDG * @return If the binQualityScores argument is set, rewrites the quality scores of the * reads into bins. Else, returns the original RDD. 
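Each maybe* helper in TransformAlignments gates one optional step on an AlignmentRecordDataset; applied unconditionally, the deduplication, quality-score binning, and repartitioning steps chain as in the sketch below, where binDescription and numPartitions stand in for the CLI arguments:

```scala
import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, QualityScoreBin }

// Unconditional version of the maybeDedupe / maybeBin / maybeRepartition steps above.
def cleanReads(reads: AlignmentRecordDataset,
               binDescription: String,
               numPartitions: Int): AlignmentRecordDataset = {
  val deduped = reads.markDuplicates()
  val binned = deduped.binQualityScores(QualityScoreBin(binDescription))
  binned.transform(_.repartition(numPartitions))
}
```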
*/ - private def maybeBin(rdd: FragmentRDD): FragmentRDD = { + private def maybeBin(rdd: FragmentDataset): FragmentDataset = { Option(args.binQualityScores).fold(rdd)(binDescription => { val bins = QualityScoreBin(binDescription) rdd.binQualityScores(bins) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala index ee1d4cc1e2..2975eee224 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala @@ -21,7 +21,7 @@ import htsjdk.samtools.{ Cigar, CigarOperator } import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.rdd.read.realignment.IndelRealignmentTarget -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.adam.rich.RichAlignmentRecord import scala.collection.JavaConversions._ @@ -87,7 +87,7 @@ object ConsensusGenerator { * is 0 bases. * @return A consensus generator that looks at previously called INDELs. */ - def fromKnownIndels(rdd: VariantRDD, + def fromKnownIndels(rdd: VariantDataset, flankSize: Int = 0): ConsensusGenerator = { new ConsensusGeneratorFromKnowns(rdd.rdd, flankSize) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala index e07b48ce62..8fe2b610e9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala @@ -375,7 +375,7 @@ class AlignmentRecordConverter extends Serializable { * Singleton object to assist with converting AlignmentRecords. * * Singleton object exists due to cross reference from - * org.bdgenomics.adam.rdd.read.AlignmentRecordRDDFunctions. + * org.bdgenomics.adam.rdd.read.AlignmentRecordDatasetFunctions. */ private[adam] object AlignmentRecordConverter extends Serializable { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala b/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala index cd7bad909c..3453516695 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala @@ -128,7 +128,7 @@ object Timers extends Metrics { val GrowingTrees = timer("Growing forest of trees") val RunningMapSideJoin = timer("Running map-side join") - // org.bdgenomics.adam.rdd.GenomicRDD + // org.bdgenomics.adam.rdd.GenomicDataset val InnerBroadcastJoin = timer("Inner broadcast region join") val RightOuterBroadcastJoin = timer("Right outer broadcast region join") val BroadcastJoinAndGroupByRight = timer("Broadcast join followed by group-by on right") diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala index 26003a2cf4..e6332b669a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Coverage.scala @@ -71,7 +71,7 @@ private[adam] object Coverage { } /** - * Coverage record for CoverageRDD. + * Coverage record for CoverageDataset. 
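ConsensusGenerator.fromKnownIndels above now takes a VariantDataset, so a set of known INDELs can be fed straight from a loader; a minimal sketch with a hypothetical path, leaving flankSize at its default of 0 bases:

```scala
import org.apache.spark.SparkContext
import org.bdgenomics.adam.algorithms.consensus.ConsensusGenerator
import org.bdgenomics.adam.rdd.ADAMContext._

// Build a consensus generator from previously called INDELs (path is hypothetical).
def knownIndelConsensus(sc: SparkContext, knownIndelsPath: String): ConsensusGenerator = {
  ConsensusGenerator.fromKnownIndels(sc.loadVariants(knownIndelsPath))
}
```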
* * Contains Region indexed by contig name, start and end, as well as the average * coverage at each base pair in that region. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala index a677791ef4..cec5a5565c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SAMFileHeaderWritable.scala @@ -19,7 +19,7 @@ package org.bdgenomics.adam.models import htsjdk.samtools.SAMFileHeader import org.bdgenomics.adam.rdd.ADAMContext -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import scala.collection.JavaConversions._ private[adam] object SAMFileHeaderWritable { @@ -70,7 +70,7 @@ private[adam] class SAMFileHeaderWritable private (hdr: SAMFileHeader) extends S // add back optional fields text.foreach(h.setTextHeader) h.setSequenceDictionary(sd.toSAMSequenceDictionary) - pgl.foreach(p => h.addProgramRecord(AlignmentRecordRDD.processingStepToSam(p))) + pgl.foreach(p => h.addProgramRecord(AlignmentRecordDataset.processingStepToSam(p))) comments.foreach(h.addComment) rgs.recordGroups.foreach(rg => h.addReadGroup(rg.toSAMReadGroupRecord)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala index 2bf5e26130..c33ed6aa56 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SnpTable.scala @@ -22,7 +22,7 @@ import com.esotericsoftware.kryo.{ Kryo, Serializer } import org.apache.spark.rdd.MetricsContext._ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.instrumentation.Timers._ -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.utils.misc.Logging import scala.annotation.tailrec import scala.math.{ max, min } @@ -143,12 +143,12 @@ object SnpTable { } /** - * Creates a SNP Table from a VariantRDD. + * Creates a SNP Table from a VariantDataset. * * @param variants The variants to populate the table from. * @return Returns a new SNPTable containing the input variants. 
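SnpTable.apply above is likewise keyed on a VariantDataset; a minimal sketch that builds the table from a hypothetical known-sites path:

```scala
import org.apache.spark.SparkContext
import org.bdgenomics.adam.models.SnpTable
import org.bdgenomics.adam.rdd.ADAMContext._

// Known SNP sites in VCF or Parquet + Avro form (path is hypothetical).
def loadKnownSnps(sc: SparkContext, knownSnpsPath: String): SnpTable = {
  SnpTable(sc.loadVariants(knownSnpsPath))
}
```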
*/ - def apply(variants: VariantRDD): SnpTable = CreatingKnownSnpsTable.time { + def apply(variants: VariantDataset): SnpTable = CreatingKnownSnpsTable.time { val (indices, positions) = CollectingSnps.time { val sortedVariants = variants.sort() .rdd diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index 9b49904bf9..9eb90c4ca0 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -53,24 +53,24 @@ import org.bdgenomics.adam.projections.{ Projection } import org.bdgenomics.adam.rdd.contig.{ - DatasetBoundNucleotideContigFragmentRDD, - NucleotideContigFragmentRDD, - ParquetUnboundNucleotideContigFragmentRDD, - RDDBoundNucleotideContigFragmentRDD + DatasetBoundNucleotideContigFragmentDataset, + NucleotideContigFragmentDataset, + ParquetUnboundNucleotideContigFragmentDataset, + RDDBoundNucleotideContigFragmentDataset } import org.bdgenomics.adam.rdd.feature._ import org.bdgenomics.adam.rdd.fragment.{ - DatasetBoundFragmentRDD, - FragmentRDD, - ParquetUnboundFragmentRDD, - RDDBoundFragmentRDD + DatasetBoundFragmentDataset, + FragmentDataset, + ParquetUnboundFragmentDataset, + RDDBoundFragmentDataset } import org.bdgenomics.adam.rdd.read.{ - AlignmentRecordRDD, - DatasetBoundAlignmentRecordRDD, + AlignmentRecordDataset, + DatasetBoundAlignmentRecordDataset, RepairPartitions, - ParquetUnboundAlignmentRecordRDD, - RDDBoundAlignmentRecordRDD + ParquetUnboundAlignmentRecordDataset, + RDDBoundAlignmentRecordDataset } import org.bdgenomics.adam.rdd.variant._ import org.bdgenomics.adam.rich.RichAlignmentRecord @@ -145,899 +145,899 @@ private case class LocatableReferenceRegion(rr: ReferenceRegion) extends Locatab object ADAMContext { // conversion functions for pipes - implicit def contigsToContigsConversionFn(gRdd: NucleotideContigFragmentRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + implicit def contigsToContigsConversionFn(gDataset: NucleotideContigFragmentDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def contigsToCoverageConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: NucleotideContigFragmentDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def contigsToCoverageDatasetConversionFn( - gRdd: NucleotideContigFragmentRDD, - ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: NucleotideContigFragmentDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def contigsToFeaturesConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: NucleotideContigFragmentDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def contigsToFeaturesDatasetConversionFn( - gRdd: NucleotideContigFragmentRDD, - ds: Dataset[FeatureProduct]): FeatureRDD = { - new 
DatasetBoundFeatureRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: NucleotideContigFragmentDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def contigsToFragmentsConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def contigsToFragmentsDatasetConversionFn( - gRdd: NucleotideContigFragmentRDD, - ds: Dataset[FragmentProduct]): FragmentRDD = { - new DatasetBoundFragmentRDD(ds, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def contigsToAlignmentRecordsConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def contigsToAlignmentRecordsDatasetConversionFn( - gRdd: NucleotideContigFragmentRDD, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - new DatasetBoundAlignmentRecordRDD(ds, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def contigsToGenotypesConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines, None) } implicit def contigsToGenotypesDatasetConversionFn( - gRdd: NucleotideContigFragmentRDD, - ds: Dataset[GenotypeProduct]): GenotypeRDD = { - new DatasetBoundGenotypeRDD(ds, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines) } implicit def contigsToVariantsConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, DefaultHeaderLines.allHeaderLines, None) } implicit def contigsToVariantsDatasetConversionFn( - gRdd: NucleotideContigFragmentRDD, - ds: Dataset[VariantProduct]): VariantRDD = { - new DatasetBoundVariantRDD(ds, - gRdd.sequences, + gDataset: NucleotideContigFragmentDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, DefaultHeaderLines.allHeaderLines) } implicit def contigsToVariantContextConversionFn( - gRdd: NucleotideContigFragmentRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, + gDataset: 
NucleotideContigFragmentDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines) } implicit def coverageToContigsConversionFn( - gRdd: CoverageRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: CoverageDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def coverageToContigsDatasetConversionFn( - gRdd: CoverageRDD, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - new DatasetBoundNucleotideContigFragmentRDD(ds, gRdd.sequences) + gDataset: CoverageDataset, + ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) } - implicit def coverageToCoverageConversionFn(gRdd: CoverageRDD, - rdd: RDD[Coverage]): CoverageRDD = { + implicit def coverageToCoverageConversionFn(gDataset: CoverageDataset, + rdd: RDD[Coverage]): CoverageDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def coverageToFeaturesConversionFn( - gRdd: CoverageRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: CoverageDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, gDataset.samples, None) } implicit def coverageToFeaturesDatasetConversionFn( - gRdd: CoverageRDD, - ds: Dataset[FeatureProduct]): FeatureRDD = { - new DatasetBoundFeatureRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: CoverageDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, gDataset.samples) } implicit def coverageToFragmentsConversionFn( - gRdd: CoverageRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: CoverageDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def coverageToFragmentsDatasetConversionFn( - gRdd: CoverageRDD, - ds: Dataset[FragmentProduct]): FragmentRDD = { - new DatasetBoundFragmentRDD(ds, - gRdd.sequences, + gDataset: CoverageDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def coverageToAlignmentRecordsConversionFn( - gRdd: CoverageRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: CoverageDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def coverageToAlignmentRecordsDatasetConversionFn( - gRdd: CoverageRDD, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - new DatasetBoundAlignmentRecordRDD(ds, - gRdd.sequences, + gDataset: CoverageDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def coverageToGenotypesConversionFn( - gRdd: CoverageRDD, - rdd: 
RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, + gDataset: CoverageDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines, None) } implicit def coverageToGenotypesDatasetConversionFn( - gRdd: CoverageRDD, - ds: Dataset[GenotypeProduct]): GenotypeRDD = { - new DatasetBoundGenotypeRDD(ds, - gRdd.sequences, + gDataset: CoverageDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines) } implicit def coverageToVariantsConversionFn( - gRdd: CoverageRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, + gDataset: CoverageDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, DefaultHeaderLines.allHeaderLines, None) } implicit def coverageToVariantsDatasetConversionFn( - gRdd: CoverageRDD, - ds: Dataset[VariantProduct]): VariantRDD = { - new DatasetBoundVariantRDD(ds, - gRdd.sequences, + gDataset: CoverageDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, DefaultHeaderLines.allHeaderLines) } implicit def coverageToVariantContextConversionFn( - gRdd: CoverageRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, + gDataset: CoverageDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines) } implicit def featuresToContigsConversionFn( - gRdd: FeatureRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: FeatureDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def featuresToContigsDatasetConversionFn( - gRdd: FeatureRDD, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - new DatasetBoundNucleotideContigFragmentRDD(ds, gRdd.sequences) + gDataset: FeatureDataset, + ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) } implicit def featuresToCoverageConversionFn( - gRdd: FeatureRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: FeatureDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, gDataset.samples, None) } implicit def featuresToCoverageDatasetConversionFn( - gRdd: FeatureRDD, - ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: FeatureDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, gDataset.samples) } - implicit def featuresToFeaturesConversionFn(gRdd: FeatureRDD, - rdd: RDD[Feature]): FeatureRDD = { + implicit def featuresToFeaturesConversionFn(gDataset: FeatureDataset, + rdd: RDD[Feature]): FeatureDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def featuresToFragmentsConversionFn( - gRdd: FeatureRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new 
RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: FeatureDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def featuresToFragmentsDatasetConversionFn( - gRdd: FeatureRDD, - ds: Dataset[FragmentProduct]): FragmentRDD = { - new DatasetBoundFragmentRDD(ds, - gRdd.sequences, + gDataset: FeatureDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def featuresToAlignmentRecordsConversionFn( - gRdd: FeatureRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: FeatureDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def featuresToAlignmentRecordsDatasetConversionFn( - gRdd: FeatureRDD, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - new DatasetBoundAlignmentRecordRDD(ds, - gRdd.sequences, + gDataset: FeatureDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def featuresToGenotypesConversionFn( - gRdd: FeatureRDD, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, + gDataset: FeatureDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines, None) } implicit def featuresToGenotypesDatasetConversionFn( - gRdd: FeatureRDD, - ds: Dataset[GenotypeProduct]): GenotypeRDD = { - new DatasetBoundGenotypeRDD(ds, - gRdd.sequences, + gDataset: FeatureDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines) } implicit def featuresToVariantsConversionFn( - gRdd: FeatureRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, + gDataset: FeatureDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, DefaultHeaderLines.allHeaderLines, None) } implicit def featuresToVariantsDatasetConversionFn( - gRdd: FeatureRDD, - ds: Dataset[VariantProduct]): VariantRDD = { - new DatasetBoundVariantRDD(ds, - gRdd.sequences, + gDataset: FeatureDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, DefaultHeaderLines.allHeaderLines) } implicit def featuresToVariantContextConversionFn( - gRdd: FeatureRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, + gDataset: FeatureDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines) } implicit def fragmentsToContigsConversionFn( - gRdd: FragmentRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: FragmentDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def fragmentsToContigsDatasetConversionFn( - gRdd: FragmentRDD, - ds: 
Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - new DatasetBoundNucleotideContigFragmentRDD(ds, gRdd.sequences) + gDataset: FragmentDataset, + ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) } implicit def fragmentsToCoverageConversionFn( - gRdd: FragmentRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: FragmentDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def fragmentsToCoverageDatasetConversionFn( - gRdd: FragmentRDD, - ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: FragmentDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def fragmentsToFeaturesConversionFn( - gRdd: FragmentRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: FragmentDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def fragmentsToFeaturesDatasetConversionFn( - gRdd: FragmentRDD, - ds: Dataset[FeatureProduct]): FeatureRDD = { - new DatasetBoundFeatureRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: FragmentDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty[Sample]) } - implicit def fragmentsToFragmentsConversionFn(gRdd: FragmentRDD, - rdd: RDD[Fragment]): FragmentRDD = { + implicit def fragmentsToFragmentsConversionFn(gDataset: FragmentDataset, + rdd: RDD[Fragment]): FragmentDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def fragmentsToAlignmentRecordsConversionFn( - gRdd: FragmentRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, - gRdd.recordGroups, - gRdd.processingSteps, + gDataset: FragmentDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, + gDataset.recordGroups, + gDataset.processingSteps, None) } implicit def fragmentsToAlignmentRecordsDatasetConversionFn( - gRdd: FragmentRDD, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - new DatasetBoundAlignmentRecordRDD(ds, - gRdd.sequences, - gRdd.recordGroups, - gRdd.processingSteps) + gDataset: FragmentDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, + gDataset.recordGroups, + gDataset.processingSteps) } implicit def fragmentsToGenotypesConversionFn( - gRdd: FragmentRDD, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, - gRdd.recordGroups.toSamples, + gDataset: FragmentDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, + gDataset.recordGroups.toSamples, DefaultHeaderLines.allHeaderLines, None) } implicit def fragmentsToGenotypesDatasetConversionFn( - gRdd: FragmentRDD, - ds: Dataset[GenotypeProduct]): GenotypeRDD = { - new DatasetBoundGenotypeRDD(ds, - gRdd.sequences, - gRdd.recordGroups.toSamples, + gDataset: 
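As the comment at the top of this object notes, these implicit functions are the conversion hooks for pipes, but they can also be called explicitly. A minimal sketch, assuming a FragmentDataset and an RDD[AlignmentRecord] derived from it are in scope; the fragments-to-alignments case carries the sequence dictionary, record groups, and processing steps across, as shown above:

```scala
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.ADAMContext
import org.bdgenomics.adam.rdd.fragment.FragmentDataset
import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
import org.bdgenomics.formats.avro.AlignmentRecord

// Rewrap reads derived from a FragmentDataset, keeping the source metadata.
def rewrapReads(fragments: FragmentDataset,
                reads: RDD[AlignmentRecord]): AlignmentRecordDataset = {
  ADAMContext.fragmentsToAlignmentRecordsConversionFn(fragments, reads)
}
```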
FragmentDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, + gDataset.recordGroups.toSamples, DefaultHeaderLines.allHeaderLines) } implicit def fragmentsToVariantsConversionFn( - gRdd: FragmentRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, + gDataset: FragmentDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, DefaultHeaderLines.allHeaderLines, None) } implicit def fragmentsToVariantsDatasetConversionFn( - gRdd: FragmentRDD, - ds: Dataset[VariantProduct]): VariantRDD = { - new DatasetBoundVariantRDD(ds, - gRdd.sequences, + gDataset: FragmentDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, DefaultHeaderLines.allHeaderLines) } implicit def fragmentsToVariantContextConversionFn( - gRdd: FragmentRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, - gRdd.recordGroups.toSamples, + gDataset: FragmentDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, + gDataset.recordGroups.toSamples, DefaultHeaderLines.allHeaderLines) } implicit def genericToContigsConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: Y, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def genericToCoverageConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: Y, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def genericToFeatureConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: Y, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def genericToFragmentsConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: Y, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def genericToAlignmentRecordsConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: Y, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def genericToGenotypesConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, + gDataset: Y, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines, None) } implicit def genericToVariantsConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[Variant]): VariantRDD = { - 
new RDDBoundVariantRDD(rdd, - gRdd.sequences, + gDataset: Y, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, DefaultHeaderLines.allHeaderLines, None) } implicit def genericToVariantContextsConversionFn[Y <: GenericGenomicDataset[_, _]]( - gRdd: Y, - rdd: RDD[VariantContext]): VariantContextRDD = { - new RDDBoundVariantContextRDD(rdd, - gRdd.sequences, + gDataset: Y, + rdd: RDD[VariantContext]): VariantContextDataset = { + new RDDBoundVariantContextDataset(rdd, + gDataset.sequences, Seq.empty, DefaultHeaderLines.allHeaderLines, None) } implicit def alignmentRecordsToContigsConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: AlignmentRecordDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def alignmentRecordsToContigsDatasetConversionFn( - gRdd: AlignmentRecordRDD, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - new DatasetBoundNucleotideContigFragmentRDD(ds, gRdd.sequences) + gDataset: AlignmentRecordDataset, + ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) } implicit def alignmentRecordsToCoverageConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: AlignmentRecordDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def alignmentRecordsToCoverageDatasetConversionFn( - gRdd: AlignmentRecordRDD, - ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: AlignmentRecordDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def alignmentRecordsToFeaturesConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: AlignmentRecordDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def alignmentRecordsToFeaturesDatasetConversionFn( - gRdd: AlignmentRecordRDD, - ds: Dataset[FeatureProduct]): FeatureRDD = { - new DatasetBoundFeatureRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: AlignmentRecordDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def alignmentRecordsToFragmentsConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, - gRdd.recordGroups, - gRdd.processingSteps, + gDataset: AlignmentRecordDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, + gDataset.recordGroups, + gDataset.processingSteps, None) } implicit def alignmentRecordsToFragmentsDatasetConversionFn( - gRdd: AlignmentRecordRDD, - ds: Dataset[FragmentProduct]): FragmentRDD = { - new DatasetBoundFragmentRDD(ds, - gRdd.sequences, - gRdd.recordGroups, - gRdd.processingSteps) + gDataset: AlignmentRecordDataset, + ds: 
Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, + gDataset.recordGroups, + gDataset.processingSteps) } - implicit def alignmentRecordsToAlignmentRecordsConversionFn(gRdd: AlignmentRecordRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { + implicit def alignmentRecordsToAlignmentRecordsConversionFn(gDataset: AlignmentRecordDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def alignmentRecordsToGenotypesConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, - gRdd.recordGroups.toSamples, + gDataset: AlignmentRecordDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, + gDataset.recordGroups.toSamples, DefaultHeaderLines.allHeaderLines, None) } implicit def alignmentRecordsToGenotypesDatasetConversionFn( - gRdd: AlignmentRecordRDD, - ds: Dataset[GenotypeProduct]): GenotypeRDD = { - new DatasetBoundGenotypeRDD(ds, - gRdd.sequences, - gRdd.recordGroups.toSamples, + gDataset: AlignmentRecordDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, + gDataset.recordGroups.toSamples, DefaultHeaderLines.allHeaderLines) } implicit def alignmentRecordsToVariantsConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, + gDataset: AlignmentRecordDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, DefaultHeaderLines.allHeaderLines, None) } implicit def alignmentRecordsToVariantsDatasetConversionFn( - gRdd: AlignmentRecordRDD, - ds: Dataset[VariantProduct]): VariantRDD = { - new DatasetBoundVariantRDD(ds, - gRdd.sequences, + gDataset: AlignmentRecordDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, DefaultHeaderLines.allHeaderLines) } implicit def alignmentRecordsToVariantContextConversionFn( - gRdd: AlignmentRecordRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, - gRdd.recordGroups.toSamples, + gDataset: AlignmentRecordDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, + gDataset.recordGroups.toSamples, DefaultHeaderLines.allHeaderLines) } implicit def genotypesToContigsConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: GenotypeDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def genotypesToContigsDatasetConversionFn( - gRdd: GenotypeRDD, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - new DatasetBoundNucleotideContigFragmentRDD(ds, gRdd.sequences) + gDataset: GenotypeDataset, + ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) } implicit def genotypesToCoverageConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + 
gDataset: GenotypeDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def genotypesToCoverageDatasetConversionFn( - gRdd: GenotypeRDD, - ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: GenotypeDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def genotypesToFeaturesConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: GenotypeDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def genotypesToFeaturesDatasetConversionFn( - gRdd: GenotypeRDD, - ds: Dataset[FeatureProduct]): FeatureRDD = { - new DatasetBoundFeatureRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: GenotypeDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def genotypesToFragmentsConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: GenotypeDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def genotypesToFragmentsDatasetConversionFn( - gRdd: GenotypeRDD, - ds: Dataset[FragmentProduct]): FragmentRDD = { - new DatasetBoundFragmentRDD(ds, - gRdd.sequences, + gDataset: GenotypeDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def genotypesToAlignmentRecordsConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: GenotypeDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def genotypesToAlignmentRecordsDatasetConversionFn( - gRdd: GenotypeRDD, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - new DatasetBoundAlignmentRecordRDD(ds, - gRdd.sequences, + gDataset: GenotypeDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } - implicit def genotypesToGenotypesConversionFn(gRdd: GenotypeRDD, - rdd: RDD[Genotype]): GenotypeRDD = { + implicit def genotypesToGenotypesConversionFn(gDataset: GenotypeDataset, + rdd: RDD[Genotype]): GenotypeDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def genotypesToVariantsConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, - gRdd.headerLines, + gDataset: GenotypeDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, + gDataset.headerLines, None) } implicit def genotypesToVariantsDatasetConversionFn( - gRdd: GenotypeRDD, - ds: Dataset[VariantProduct]): VariantRDD = { - new DatasetBoundVariantRDD(ds, - gRdd.sequences, - gRdd.headerLines) + gDataset: GenotypeDataset, + ds: 
Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, + gDataset.headerLines) } implicit def genotypesToVariantContextConversionFn( - gRdd: GenotypeRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, - gRdd.samples, - gRdd.headerLines) + gDataset: GenotypeDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, + gDataset.samples, + gDataset.headerLines) } implicit def variantsToContigsConversionFn( - gRdd: VariantRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: VariantDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def variantsToContigsDatasetConversionFn( - gRdd: VariantRDD, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - new DatasetBoundNucleotideContigFragmentRDD(ds, gRdd.sequences) + gDataset: VariantDataset, + ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) } implicit def variantsToCoverageConversionFn( - gRdd: VariantRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: VariantDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def variantsToCoverageDatasetConversionFn( - gRdd: VariantRDD, - ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: VariantDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def variantsToFeaturesConversionFn( - gRdd: VariantRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: VariantDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def variantsToFeaturesDatasetConversionFn( - gRdd: VariantRDD, - ds: Dataset[FeatureProduct]): FeatureRDD = { - new DatasetBoundFeatureRDD(ds, gRdd.sequences, Seq.empty[Sample]) + gDataset: VariantDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty[Sample]) } implicit def variantsToFragmentsConversionFn( - gRdd: VariantRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: VariantDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def variantsToFragmentsDatasetConversionFn( - gRdd: VariantRDD, - ds: Dataset[FragmentProduct]): FragmentRDD = { - new DatasetBoundFragmentRDD(ds, - gRdd.sequences, + gDataset: VariantDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def variantsToAlignmentRecordsConversionFn( - gRdd: VariantRDD, - rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: VariantDataset, + rdd: 
RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def variantsToAlignmentRecordsDatasetConversionFn( - gRdd: VariantRDD, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - new DatasetBoundAlignmentRecordRDD(ds, - gRdd.sequences, + gDataset: VariantDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty) } implicit def variantsToGenotypesConversionFn( - gRdd: VariantRDD, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, + gDataset: VariantDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, Seq.empty, - gRdd.headerLines, + gDataset.headerLines, None) } implicit def variantsToGenotypesDatasetConversionFn( - gRdd: VariantRDD, - ds: Dataset[GenotypeProduct]): GenotypeRDD = { - new DatasetBoundGenotypeRDD(ds, - gRdd.sequences, + gDataset: VariantDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, Seq.empty, - gRdd.headerLines) + gDataset.headerLines) } - implicit def variantsToVariantsConversionFn(gRdd: VariantRDD, - rdd: RDD[Variant]): VariantRDD = { + implicit def variantsToVariantsConversionFn(gDataset: VariantDataset, + rdd: RDD[Variant]): VariantDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } implicit def variantsToVariantContextConversionFn( - gRdd: VariantRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { - VariantContextRDD(rdd, - gRdd.sequences, + gDataset: VariantDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, Seq.empty, - gRdd.headerLines) + gDataset.headerLines) } implicit def variantContextsToContigsConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(rdd, gRdd.sequences, None) + gDataset: VariantContextDataset, + rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) } implicit def variantContextsToCoverageConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: VariantContextDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def variantContextsToFeaturesConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[Feature]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, gRdd.sequences, Seq.empty[Sample], None) + gDataset: VariantContextDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def variantContextsToFragmentsConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[Fragment]): FragmentRDD = { - new RDDBoundFragmentRDD(rdd, - gRdd.sequences, + gDataset: VariantContextDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def variantContextsToAlignmentRecordsConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[AlignmentRecord]): 
AlignmentRecordRDD = { - new RDDBoundAlignmentRecordRDD(rdd, - gRdd.sequences, + gDataset: VariantContextDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, RecordGroupDictionary.empty, Seq.empty, None) } implicit def variantContextsToGenotypesConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[Genotype]): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd, - gRdd.sequences, - gRdd.samples, - gRdd.headerLines, + gDataset: VariantContextDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, + gDataset.samples, + gDataset.headerLines, None) } implicit def variantContextsToVariantsConversionFn( - gRdd: VariantContextRDD, - rdd: RDD[Variant]): VariantRDD = { - new RDDBoundVariantRDD(rdd, - gRdd.sequences, - gRdd.headerLines, + gDataset: VariantContextDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, + gDataset.headerLines, None) } - implicit def variantContextsToVariantContextsConversionFn(gRdd: VariantContextRDD, - rdd: RDD[VariantContext]): VariantContextRDD = { + implicit def variantContextsToVariantContextsConversionFn(gDataset: VariantContextDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { // hijack the transform function to discard the old RDD - gRdd.transform(oldRdd => rdd) + gDataset.transform(oldRdd => rdd) } // Add ADAM Spark context methods @@ -1503,7 +1503,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load alignment records from BAM/CRAM/SAM into an AlignmentRecordRDD. + * Load alignment records from BAM/CRAM/SAM into an AlignmentRecordDataset. * * This reads the sequence and record group dictionaries from the BAM/CRAM/SAM file * header. SAMRecords are read from the file and converted to the @@ -1513,13 +1513,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Globs/directories are supported. * @param stringency The validation stringency to use when validating the * BAM/CRAM/SAM format header. Defaults to ValidationStringency.STRICT. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. */ def loadBam( pathName: String, - stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = LoadBam.time { + stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordDataset = LoadBam.time { val path = new Path(pathName) val bamFiles = getFsAndFiles(path) @@ -1582,7 +1582,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (Metrics.isRecording) records.instrument() else records val samRecordConverter = new SAMRecordConverter - AlignmentRecordRDD(records.map(p => samRecordConverter.convert(p._2.get)), + AlignmentRecordDataset(records.map(p => samRecordConverter.convert(p._2.get)), seqDict, readGroups, programs) @@ -1595,14 +1595,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param pathName The path name to load indexed BAM formatted alignment records from. * Globs/directories are supported. * @param viewRegion The ReferenceRegion we are filtering on. 
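Reviewer aid, not part of the patch: the implicit conversion functions above are what a type-changing transmute resolves after the rename. A minimal sketch, assuming an active SparkContext `sc`, a hypothetical "sample.bam", and that the post-rename transmute keeps its `[X, Y <: Product, Z]` type parameters:

// Sketch only: path and explicit type arguments are assumptions.
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.feature.FeatureDataset
import org.bdgenomics.adam.sql.{ Feature => FeatureProduct }
import org.bdgenomics.formats.avro.{ AlignmentRecord, Feature }

val reads = sc.loadBam("sample.bam")

// Collapse each mapped read to a Feature spanning its alignment; the implicit
// alignmentRecordsToFeaturesConversionFn above rebuilds the dataset metadata.
val readSpans: FeatureDataset =
  reads.transmute[Feature, FeatureProduct, FeatureDataset]((rdd: RDD[AlignmentRecord]) =>
    rdd.filter(r => r.getReadMapped)
      .map(r => Feature.newBuilder()
        .setContigName(r.getContigName)
        .setStart(r.getStart)
        .setEnd(r.getEnd)
        .build()))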
- * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. */ // todo: add stringency with default if possible def loadIndexedBam( pathName: String, - viewRegion: ReferenceRegion): AlignmentRecordRDD = { + viewRegion: ReferenceRegion): AlignmentRecordDataset = { loadIndexedBam(pathName, Iterable(viewRegion)) } @@ -1615,14 +1615,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param viewRegions Iterable of ReferenceRegion we are filtering on. * @param stringency The validation stringency to use when validating the * BAM/CRAM/SAM format header. Defaults to ValidationStringency.STRICT. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. */ def loadIndexedBam( pathName: String, viewRegions: Iterable[ReferenceRegion], - stringency: ValidationStringency = ValidationStringency.STRICT)(implicit s: DummyImplicit): AlignmentRecordRDD = LoadIndexedBam.time { + stringency: ValidationStringency = ValidationStringency.STRICT)(implicit s: DummyImplicit): AlignmentRecordDataset = LoadIndexedBam.time { val path = new Path(pathName) @@ -1696,7 +1696,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (Metrics.isRecording) records.instrument() else records val samRecordConverter = new SAMRecordConverter - AlignmentRecordRDD(records.map(p => samRecordConverter.convert(p._2.get)), + AlignmentRecordDataset(records.map(p => samRecordConverter.convert(p._2.get)), seqDict, readGroups, programs) @@ -1864,7 +1864,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in Parquet + Avro format into an AlignmentRecordRDD. + * Load a path name in Parquet + Avro format into an AlignmentRecordDataset. * * @note The sequence dictionary is read from an Avro file stored at * pathName/_seqdict.avro and the record group dictionary is read from an @@ -1877,14 +1877,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. 
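Usage sketch for the indexed BAM loader above (path and coordinates are made up; assumes `sc` is an active SparkContext and a `.bai` index sits next to the BAM):

import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext._

// Pull only the reads overlapping 1:100,000-200,000 from an indexed BAM.
val region = ReferenceRegion("1", 100000L, 200000L)
val regionReads = sc.loadIndexedBam("sample.bam", region)
println(regionReads.rdd.count())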
*/ def loadParquetAlignments( pathName: String, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): AlignmentRecordRDD = { + optProjection: Option[Schema] = None): AlignmentRecordDataset = { // convert avro to sequence dictionary val sd = loadAvroSequenceDictionary(pathName) @@ -1897,20 +1897,20 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log (optPredicate, optProjection) match { case (None, None) => { - ParquetUnboundAlignmentRecordRDD(sc, pathName, sd, rgd, pgs) + ParquetUnboundAlignmentRecordDataset(sc, pathName, sd, rgd, pgs) } case (_, _) => { // load from disk val rdd = loadParquet[AlignmentRecord](pathName, optPredicate, optProjection) - RDDBoundAlignmentRecordRDD(rdd, sd, rgd, pgs, + RDDBoundAlignmentRecordDataset(rdd, sd, rgd, pgs, optPartitionMap = extractPartitionMap(pathName)) } } } /** - * Load a path name with range binned partitioned Parquet format into an AlignmentRecordRDD. + * Load a path name with range binned partitioned Parquet format into an AlignmentRecordDataset. * * @note The sequence dictionary is read from an Avro file stored at * pathName/_seqdict.avro and the record group dictionary is read from an @@ -1923,15 +1923,15 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping * region when using the filterByOverlappingRegions function on the returned dataset. * Defaults to one partition. - * @return Returns an AlignmentRecordRDD. + * @return Returns an AlignmentRecordDataset. */ def loadPartitionedParquetAlignments(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): AlignmentRecordRDD = { + optLookbackPartitions: Option[Int] = Some(1)): AlignmentRecordDataset = { val partitionBinSize = getPartitionBinSize(pathName) val reads = loadParquetAlignments(pathName) - val alignmentsDatasetBound = DatasetBoundAlignmentRecordRDD(reads.dataset, + val alignmentsDatasetBound = DatasetBoundAlignmentRecordDataset(reads.dataset, reads.sequences, reads.recordGroups, reads.processingSteps, @@ -1944,7 +1944,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load unaligned alignment records from interleaved FASTQ into an AlignmentRecordRDD. + * Load unaligned alignment records from interleaved FASTQ into an AlignmentRecordDataset. * * In interleaved FASTQ, the two reads from a paired sequencing protocol are * interleaved in a single file. This is a zipped representation of the @@ -1952,10 +1952,10 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * * @param pathName The path name to load unaligned alignment records from. * Globs/directories are supported. - * @return Returns an unaligned AlignmentRecordRDD. + * @return Returns an unaligned AlignmentRecordDataset. 
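Sketch of the Parquet alignment loaders above; the paths are hypothetical and the projected fields are illustrative choices, not a recommendation:

import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.projections.{ AlignmentRecordField, Projection }
import org.bdgenomics.adam.rdd.ADAMContext._

// Read only the columns needed for interval arithmetic.
val spanProjection = Projection(AlignmentRecordField.contigName,
  AlignmentRecordField.start,
  AlignmentRecordField.end)
val alignments = sc.loadParquetAlignments("alignments.adam",
  optProjection = Some(spanProjection))

// Range-binned layout: the region filter is applied at the partition level.
val binned = sc.loadPartitionedParquetAlignments("alignments.partitioned.adam",
  regions = Iterable(ReferenceRegion("1", 0L, 1000000L)))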
*/ def loadInterleavedFastq( - pathName: String): AlignmentRecordRDD = LoadInterleavedFastq.time { + pathName: String): AlignmentRecordDataset = LoadInterleavedFastq.time { val job = HadoopUtil.newJob(sc) val conf = ContextUtil.getConfiguration(job) @@ -1973,11 +1973,11 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log // convert records val fastqRecordConverter = new FastqRecordConverter - AlignmentRecordRDD.unaligned(records.flatMap(fastqRecordConverter.convertPair)) + AlignmentRecordDataset.unaligned(records.flatMap(fastqRecordConverter.convertPair)) } /** - * Load unaligned alignment records from (possibly paired) FASTQ into an AlignmentRecordRDD. + * Load unaligned alignment records from (possibly paired) FASTQ into an AlignmentRecordDataset. * * @see loadPairedFastq * @see loadUnpairedFastq @@ -1990,13 +1990,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * records. Defaults to None. * @param stringency The validation stringency to use when validating (possibly paired) FASTQ format. * Defaults to ValidationStringency.STRICT. - * @return Returns an unaligned AlignmentRecordRDD. + * @return Returns an unaligned AlignmentRecordDataset. */ def loadFastq( pathName1: String, optPathName2: Option[String], optRecordGroup: Option[String] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = LoadFastq.time { + stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordDataset = LoadFastq.time { optPathName2.fold({ loadUnpairedFastq(pathName1, @@ -2011,7 +2011,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load unaligned alignment records from paired FASTQ into an AlignmentRecordRDD. + * Load unaligned alignment records from paired FASTQ into an AlignmentRecordDataset. * * @param pathName1 The path name to load the first set of unaligned alignment records from. * Globs/directories are supported. @@ -2024,14 +2024,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * validation. Defaults to StorageLevel.MEMORY_ONLY. * @param stringency The validation stringency to use when validating paired FASTQ format. * Defaults to ValidationStringency.STRICT. - * @return Returns an unaligned AlignmentRecordRDD. + * @return Returns an unaligned AlignmentRecordDataset. */ def loadPairedFastq( pathName1: String, pathName2: String, optRecordGroup: Option[String] = None, persistLevel: Option[StorageLevel] = Some(StorageLevel.MEMORY_ONLY), - stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = LoadPairedFastq.time { + stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordDataset = LoadPairedFastq.time { val reads1 = loadUnpairedFastq( pathName1, @@ -2063,11 +2063,11 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log case ValidationStringency.SILENT => } - AlignmentRecordRDD.unaligned(reads1.rdd ++ reads2.rdd) + AlignmentRecordDataset.unaligned(reads1.rdd ++ reads2.rdd) } /** - * Load unaligned alignment records from unpaired FASTQ into an AlignmentRecordRDD. + * Load unaligned alignment records from unpaired FASTQ into an AlignmentRecordDataset. * * @param pathName The path name to load unaligned alignment records from. * Globs/directories are supported. @@ -2079,14 +2079,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * records. Defaults to None. 
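Usage sketch for the FASTQ entry points above (paths hypothetical; assumes `sc` with the ADAMContext implicits in scope):

import org.bdgenomics.adam.rdd.ADAMContext._

// Paired FASTQ: the second path is passed as an Option.
val pairedReads = sc.loadFastq("sample_1.fq", Some("sample_2.fq"))

// Interleaved FASTQ: both mates of each pair live in one file.
val interleavedReads = sc.loadInterleavedFastq("sample.ifq")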
* @param stringency The validation stringency to use when validating unpaired FASTQ format. * Defaults to ValidationStringency.STRICT. - * @return Returns an unaligned AlignmentRecordRDD. + * @return Returns an unaligned AlignmentRecordDataset. */ def loadUnpairedFastq( pathName: String, setFirstOfPair: Boolean = false, setSecondOfPair: Boolean = false, optRecordGroup: Option[String] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = LoadUnpairedFastq.time { + stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordDataset = LoadUnpairedFastq.time { val job = HadoopUtil.newJob(sc) val conf = ContextUtil.getConfiguration(job) @@ -2105,7 +2105,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log // convert records val fastqRecordConverter = new FastqRecordConverter - AlignmentRecordRDD.unaligned(records.map( + AlignmentRecordDataset.unaligned(records.map( fastqRecordConverter.convertRead( _, optRecordGroup.map(recordGroup => @@ -2150,17 +2150,17 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load variant context records from VCF into a VariantContextRDD. + * Load variant context records from VCF into a VariantContextDataset. * * @param pathName The path name to load VCF variant context records from. * Globs/directories are supported. * @param stringency The validation stringency to use when validating VCF format. * Defaults to ValidationStringency.STRICT. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. */ def loadVcf( pathName: String, - stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextRDD = LoadVcf.time { + stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextDataset = LoadVcf.time { // load records from VCF val records = readVcfRecords(pathName, None) @@ -2172,14 +2172,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val (sd, samples, headers) = loadVcfMetadata(pathName) val vcc = VariantContextConverter(headers, stringency, sc.hadoopConfiguration) - VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)), + VariantContextDataset(records.flatMap(p => vcc.convert(p._2.get)), sd, samples, VariantContextConverter.cleanAndMixInSupportedLines(headers, stringency, log)) } /** - * Load variant context records from VCF into a VariantContextRDD. + * Load variant context records from VCF into a VariantContextDataset. * * Only converts the core Genotype/Variant fields, and the fields set in the * requested projection. Core variant fields include: @@ -2205,13 +2205,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * fields listed above. * @param stringency The validation stringency to use when validating VCF format. * Defaults to ValidationStringency.STRICT. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. 
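A minimal sketch of the VCF loader above with relaxed validation (path hypothetical):

import htsjdk.samtools.ValidationStringency
import org.bdgenomics.adam.rdd.ADAMContext._

// Lenient parsing logs malformed records instead of failing the load.
val variantContexts = sc.loadVcf("sample.vcf", ValidationStringency.LENIENT)
println(variantContexts.samples.length)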
*/ def loadVcfWithProjection( pathName: String, infoFields: Set[String], formatFields: Set[String], - stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextRDD = LoadVcf.time { + stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextDataset = LoadVcf.time { // load records from VCF val records = readVcfRecords(pathName, None) @@ -2239,41 +2239,41 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } case _ => None }), stringency, sc.hadoopConfiguration) - VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)), + VariantContextDataset(records.flatMap(p => vcc.convert(p._2.get)), sd, samples, VariantContextConverter.cleanAndMixInSupportedLines(headers, stringency, log)) } /** - * Load variant context records from VCF indexed by tabix (tbi) into a VariantContextRDD. + * Load variant context records from VCF indexed by tabix (tbi) into a VariantContextDataset. * * @param pathName The path name to load VCF variant context records from. * Globs/directories are supported. * @param viewRegion ReferenceRegion we are filtering on. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. */ // todo: add stringency with default if possible def loadIndexedVcf( pathName: String, - viewRegion: ReferenceRegion): VariantContextRDD = { + viewRegion: ReferenceRegion): VariantContextDataset = { loadIndexedVcf(pathName, Iterable(viewRegion)) } /** - * Load variant context records from VCF indexed by tabix (tbi) into a VariantContextRDD. + * Load variant context records from VCF indexed by tabix (tbi) into a VariantContextDataset. * * @param pathName The path name to load VCF variant context records from. * Globs/directories are supported. * @param viewRegions Iterator of ReferenceRegions we are filtering on. * @param stringency The validation stringency to use when validating VCF format. * Defaults to ValidationStringency.STRICT. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. */ def loadIndexedVcf( pathName: String, viewRegions: Iterable[ReferenceRegion], - stringency: ValidationStringency = ValidationStringency.STRICT)(implicit s: DummyImplicit): VariantContextRDD = LoadIndexedVcf.time { + stringency: ValidationStringency = ValidationStringency.STRICT)(implicit s: DummyImplicit): VariantContextDataset = LoadIndexedVcf.time { // load records from VCF val records = readVcfRecords(pathName, Some(viewRegions)) @@ -2285,14 +2285,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val (sd, samples, headers) = loadVcfMetadata(pathName) val vcc = VariantContextConverter(headers, stringency, sc.hadoopConfiguration) - VariantContextRDD(records.flatMap(p => vcc.convert(p._2.get)), + VariantContextDataset(records.flatMap(p => vcc.convert(p._2.get)), sd, samples, VariantContextConverter.cleanAndMixInSupportedLines(headers, stringency, log)) } /** - * Load a path name in Parquet + Avro format into a GenotypeRDD. + * Load a path name in Parquet + Avro format into a GenotypeDataset. * * @param pathName The path name to load genotypes from. * Globs/directories are supported. @@ -2300,12 +2300,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns a GenotypeRDD. + * @return Returns a GenotypeDataset. 
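Sketch of the tabix-indexed and projected VCF loaders above (region, field names, and path are illustrative; a `.tbi` index is assumed to exist):

import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext._

// Region query against a bgzipped, tabix-indexed VCF.
val windowed = sc.loadIndexedVcf("sample.vcf.gz", ReferenceRegion("1", 0L, 1000000L))

// Convert only the core fields plus the named INFO/FORMAT fields.
val projected = sc.loadVcfWithProjection("sample.vcf",
  infoFields = Set("AC", "AN"),
  formatFields = Set("DP"))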
*/ def loadParquetGenotypes( pathName: String, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): GenotypeRDD = { + optProjection: Option[Schema] = None): GenotypeDataset = { // load header lines val headers = loadHeaderLines(pathName) @@ -2318,20 +2318,20 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log (optPredicate, optProjection) match { case (None, None) => { - ParquetUnboundGenotypeRDD(sc, pathName, sd, samples, headers) + ParquetUnboundGenotypeDataset(sc, pathName, sd, samples, headers) } case (_, _) => { // load from disk val rdd = loadParquet[Genotype](pathName, optPredicate, optProjection) - new RDDBoundGenotypeRDD(rdd, sd, samples, headers, + new RDDBoundGenotypeDataset(rdd, sd, samples, headers, optPartitionMap = extractPartitionMap(pathName)) } } } /** - * Load a path name with range binned partitioned Parquet format into a GenotypeRDD. + * Load a path name with range binned partitioned Parquet format into a GenotypeDataset. * * @param pathName The path name to load alignment records from. * Globs/directories are supported. @@ -2339,15 +2339,15 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping * region when using the filterByOverlappingRegions function on the returned dataset. * Defaults to one partition. - * @return Returns a GenotypeRDD. + * @return Returns a GenotypeDataset. */ def loadPartitionedParquetGenotypes(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): GenotypeRDD = { + optLookbackPartitions: Option[Int] = Some(1)): GenotypeDataset = { val partitionedBinSize = getPartitionBinSize(pathName) val genotypes = loadParquetGenotypes(pathName) - val genotypesDatasetBound = DatasetBoundGenotypeRDD(genotypes.dataset, + val genotypesDatasetBound = DatasetBoundGenotypeDataset(genotypes.dataset, genotypes.sequences, genotypes.samples, genotypes.headerLines, @@ -2360,14 +2360,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in VCF or Parquet + Avro format into a VariantContextRDD. + * Load a path name in VCF or Parquet format into a VariantContextDataset. * * @param pathName The path name to load variant context records from. * Globs/directories are supported. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. */ def loadVariantContexts( - pathName: String): VariantContextRDD = { + pathName: String): VariantContextDataset = { if (isVcfExt(pathName)) { loadVcf(pathName) @@ -2377,14 +2377,14 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in Parquet + Avro format into a VariantContextRDD. + * Load a path name in Parquet + Avro format into a VariantContextDataset. * * @param pathName The path name to load variant context records from. * Globs/directories are supported. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. 
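Sketch of the genotype and variant-context loaders above (paths hypothetical; getSampleId comes from the bdg-formats Sample record):

import org.bdgenomics.adam.rdd.ADAMContext._

val genotypes = sc.loadParquetGenotypes("genotypes.adam")
println(genotypes.samples.map(_.getSampleId).mkString(", "))

// Dispatches on the extension: VCF paths go through loadVcf, otherwise Parquet.
val contexts = sc.loadVariantContexts("calls.vcf")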
*/ def loadParquetVariantContexts( - pathName: String): VariantContextRDD = { + pathName: String): VariantContextDataset = { // load header lines val headers = loadHeaderLines(pathName) @@ -2399,11 +2399,11 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log import sqlContext.implicits._ val ds = sqlContext.read.parquet(pathName).as[VariantContextProduct] - new DatasetBoundVariantContextRDD(ds, sd, samples, headers) + new DatasetBoundVariantContextDataset(ds, sd, samples, headers) } /** - * Load a path name with range binned partitioned Parquet format into a VariantContextRDD. + * Load a path name with range binned partitioned Parquet format into a VariantContextDataset. * * @param pathName The path name to load variant context records from. * Globs/directories are supported. @@ -2411,15 +2411,15 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping * region when using the filterByOverlappingRegions function on the returned dataset. * Defaults to one partition. - * @return Returns a VariantContextRDD. + * @return Returns a VariantContextDataset. */ def loadPartitionedParquetVariantContexts(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): VariantContextRDD = { + optLookbackPartitions: Option[Int] = Some(1)): VariantContextDataset = { val partitionedBinSize = getPartitionBinSize(pathName) val variantContexts = loadParquetVariantContexts(pathName) - val variantContextsDatasetBound = DatasetBoundVariantContextRDD(variantContexts.dataset, + val variantContextsDatasetBound = DatasetBoundVariantContextDataset(variantContexts.dataset, variantContexts.sequences, variantContexts.samples, variantContexts.headerLines, @@ -2432,7 +2432,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in Parquet + Avro format into a VariantRDD. + * Load a path name in Parquet format into a VariantDataset. * * @param pathName The path name to load variants from. * Globs/directories are supported. @@ -2440,12 +2440,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns a VariantRDD. + * @return Returns a VariantDataset. */ def loadParquetVariants( pathName: String, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): VariantRDD = { + optProjection: Option[Schema] = None): VariantDataset = { val sd = loadAvroSequenceDictionary(pathName) @@ -2454,18 +2454,18 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log (optPredicate, optProjection) match { case (None, None) => { - new ParquetUnboundVariantRDD(sc, pathName, sd, headers) + new ParquetUnboundVariantDataset(sc, pathName, sd, headers) } case _ => { val rdd = loadParquet[Variant](pathName, optPredicate, optProjection) - new RDDBoundVariantRDD(rdd, sd, headers, + new RDDBoundVariantDataset(rdd, sd, headers, optPartitionMap = extractPartitionMap(pathName)) } } } /** - * Load a path name with range binned partitioned Parquet format into a VariantRDD. + * Load a path name with range binned partitioned Parquet format into a VariantDataset. * * @param pathName The path name to load alignment records from. * Globs/directories are supported. 
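Sketch for the Parquet variant loader above (path hypothetical):

import org.bdgenomics.adam.rdd.ADAMContext._

val variants = sc.loadParquetVariants("variants.adam")
// The VCF header lines recovered from the Avro metadata ride along.
variants.headerLines.foreach(println)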
@@ -2473,15 +2473,15 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping * region when using the filterByOverlappingRegions function on the returned dataset. * Defaults to one partition. - * @return Returns a VariantRDD. + * @return Returns a VariantDataset. */ def loadPartitionedParquetVariants(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): VariantRDD = { + optLookbackPartitions: Option[Int] = Some(1)): VariantDataset = { val partitionedBinSize = getPartitionBinSize(pathName) val variants = loadParquetVariants(pathName) - val variantsDatasetBound = DatasetBoundVariantRDD(variants.dataset, + val variantsDatasetBound = DatasetBoundVariantDataset(variants.dataset, variants.sequences, variants.headerLines, isPartitioned = true, @@ -2493,17 +2493,17 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load nucleotide contig fragments from FASTA into a NucleotideContigFragmentRDD. + * Load nucleotide contig fragments from FASTA into a NucleotideContigFragmentDataset. * * @param pathName The path name to load nucleotide contig fragments from. * Globs/directories are supported. * @param maximumLength Maximum fragment length. Defaults to 10000L. Values greater * than 1e9 should be avoided. - * @return Returns a NucleotideContigFragmentRDD. + * @return Returns a NucleotideContigFragmentDataset. */ def loadFasta( pathName: String, - maximumLength: Long = 10000L): NucleotideContigFragmentRDD = LoadFasta.time { + maximumLength: Long = 10000L): NucleotideContigFragmentDataset = LoadFasta.time { val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile( pathName, @@ -2519,12 +2519,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log val fragmentRdd = FastaConverter(remapData, maximumLength) .cache() - NucleotideContigFragmentRDD(fragmentRdd) + NucleotideContigFragmentDataset(fragmentRdd) } /** * Load paired unaligned alignment records grouped by sequencing fragment - * from interleaved FASTQ into an FragmentRDD. + * from interleaved FASTQ into an FragmentDataset. * * In interleaved FASTQ, the two reads from a paired sequencing protocol are * interleaved in a single file. This is a zipped representation of the @@ -2535,11 +2535,11 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * * @param pathName The path name to load unaligned alignment records from. * Globs/directories are supported. - * @return Returns a FragmentRDD containing the paired reads grouped by + * @return Returns a FragmentDataset containing the paired reads grouped by * sequencing fragment. */ def loadInterleavedFastqAsFragments( - pathName: String): FragmentRDD = LoadInterleavedFastqFragments.time { + pathName: String): FragmentDataset = LoadInterleavedFastqFragments.time { val job = HadoopUtil.newJob(sc) val conf = ContextUtil.getConfiguration(job) @@ -2557,12 +2557,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log // convert records val fastqRecordConverter = new FastqRecordConverter - FragmentRDD.fromRdd(records.map(fastqRecordConverter.convertFragment)) + FragmentDataset.fromRdd(records.map(fastqRecordConverter.convertFragment)) } /** * Load paired unaligned alignment records grouped by sequencing fragment - * from paired FASTQ files into an FragmentRDD. 
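Sketch for loadFasta above (reference path hypothetical; 10 kbp is the documented default fragment length):

import org.bdgenomics.adam.rdd.ADAMContext._

// Splits the reference into fragments of at most 10 kbp while building the
// sequence dictionary.
val contigs = sc.loadFasta("reference.fa", maximumLength = 10000L)
println(contigs.sequences)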
+ * from paired FASTQ files into an FragmentDataset. * * Fragments represent all of the reads from a single sequenced fragment as * a single object, which is a useful representation for some tasks. @@ -2578,7 +2578,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * validation. Defaults to StorageLevel.MEMORY_ONLY. * @param stringency The validation stringency to use when validating paired FASTQ format. * Defaults to ValidationStringency.STRICT. - * @return Returns a FragmentRDD containing the paired reads grouped by + * @return Returns a FragmentDataset containing the paired reads grouped by * sequencing fragment. */ def loadPairedFastqAsFragments( @@ -2586,13 +2586,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log pathName2: String, optRecordGroup: Option[String] = None, persistLevel: Option[StorageLevel] = Some(StorageLevel.MEMORY_ONLY), - stringency: ValidationStringency = ValidationStringency.STRICT): FragmentRDD = LoadPairedFastqFragments.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FragmentDataset = LoadPairedFastqFragments.time { loadPairedFastq(pathName1, pathName2, optRecordGroup, persistLevel, stringency).toFragments() } /** - * Load features into a FeatureRDD and convert to a CoverageRDD. + * Load features into a FeatureDataset and convert to a CoverageDataset. * Coverage is stored in the score field of Feature. * * Loads path names ending in: @@ -2628,7 +2628,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param stringency The validation stringency to use when validating BED6/12, GFF3, * GTF/GFF2, NarrowPeak, or IntervalList formats. Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD converted to a CoverageRDD. + * @return Returns a FeatureDataset converted to a CoverageDataset. */ def loadCoverage( pathName: String, @@ -2636,7 +2636,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log optMinPartitions: Option[Int] = None, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): CoverageRDD = LoadCoverage.time { + stringency: ValidationStringency = ValidationStringency.STRICT): CoverageDataset = LoadCoverage.time { loadFeatures(pathName, optSequenceDictionary = optSequenceDictionary, @@ -2647,7 +2647,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in Parquet + Avro format into a FeatureRDD and convert to a CoverageRDD. + * Load a path name in Parquet + Avro format into a FeatureDataset and convert to a CoverageDataset. * Coverage is stored in the score field of Feature. * * @param pathName The path name to load features from. @@ -2655,19 +2655,19 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. * Defaults to None. * @param forceRdd Forces loading the RDD. - * @return Returns a FeatureRDD converted to a CoverageRDD. + * @return Returns a FeatureDataset converted to a CoverageDataset. 
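Sketch for the fragment and coverage loaders above (paths hypothetical; the score-as-coverage convention is the one described in the scaladoc):

import org.bdgenomics.adam.rdd.ADAMContext._

// Paired FASTQ grouped by sequencing fragment.
val fragments = sc.loadPairedFastqAsFragments("sample_1.fq", "sample_2.fq")

// Any supported feature format; the Feature score field becomes Coverage.count.
val coverage = sc.loadCoverage("scores.bed")
val meanCoverage = coverage.rdd.map(_.count).mean()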
*/ def loadParquetCoverage( pathName: String, optPredicate: Option[FilterPredicate] = None, - forceRdd: Boolean = false): CoverageRDD = { + forceRdd: Boolean = false): CoverageDataset = { if (optPredicate.isEmpty && !forceRdd) { // convert avro to sequence dictionary val sd = loadAvroSequenceDictionary(pathName) val samples = loadAvroSamples(pathName) - new ParquetUnboundCoverageRDD(sc, pathName, sd, samples) + new ParquetUnboundCoverageDataset(sc, pathName, sd, samples) } else { val coverageFields = Projection(FeatureField.contigName, FeatureField.start, @@ -2682,7 +2682,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in GFF3 format into a FeatureRDD. + * Load a path name in GFF3 format into a FeatureDataset. * * @param pathName The path name to load features in GFF3 format from. * Globs/directories are supported. @@ -2691,24 +2691,24 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * not set, falls back to the configured Spark default parallelism. Defaults to None. * @param stringency The validation stringency to use when validating GFF3 format. * Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadGff3( pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FeatureRDD = LoadGff3.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset = LoadGff3.time { val records = sc.textFile(pathName, optMinPartitions.getOrElse(sc.defaultParallelism)) .flatMap(new GFF3Parser().parse(_, stringency)) if (Metrics.isRecording) records.instrument() else records optSequenceDictionary - .fold(FeatureRDD(records))(FeatureRDD(records, _, Seq.empty)) + .fold(FeatureDataset(records))(FeatureDataset(records, _, Seq.empty)) } /** - * Load a path name in GTF/GFF2 format into a FeatureRDD. + * Load a path name in GTF/GFF2 format into a FeatureDataset. * * @param pathName The path name to load features in GTF/GFF2 format from. * Globs/directories are supported. @@ -2717,24 +2717,24 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * not set, falls back to the configured Spark default parallelism. Defaults to None. * @param stringency The validation stringency to use when validating GTF/GFF2 format. * Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadGtf( pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FeatureRDD = LoadGtf.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset = LoadGtf.time { val records = sc.textFile(pathName, optMinPartitions.getOrElse(sc.defaultParallelism)) .flatMap(new GTFParser().parse(_, stringency)) if (Metrics.isRecording) records.instrument() else records optSequenceDictionary - .fold(FeatureRDD(records))(FeatureRDD(records, _, Seq.empty)) + .fold(FeatureDataset(records))(FeatureDataset(records, _, Seq.empty)) } /** - * Load a path name in BED6/12 format into a FeatureRDD. + * Load a path name in BED6/12 format into a FeatureDataset. * * @param pathName The path name to load features in BED6/12 format from. * Globs/directories are supported. 
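Sketch for the GFF3 and GTF/GFF2 loaders above (paths hypothetical):

import org.bdgenomics.adam.rdd.ADAMContext._

val gff3Features = sc.loadGff3("annotation.gff3")
val gtfFeatures = sc.loadGtf("genes.gtf")
val geneCount = gtfFeatures.rdd.filter(f => f.getFeatureType == "gene").count()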
@@ -2743,24 +2743,24 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * not set, falls back to the configured Spark default parallelism. Defaults to None. * @param stringency The validation stringency to use when validating BED6/12 format. * Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadBed( pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FeatureRDD = LoadBed.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset = LoadBed.time { val records = sc.textFile(pathName, optMinPartitions.getOrElse(sc.defaultParallelism)) .flatMap(new BEDParser().parse(_, stringency)) if (Metrics.isRecording) records.instrument() else records optSequenceDictionary - .fold(FeatureRDD(records))(FeatureRDD(records, _, Seq.empty)) + .fold(FeatureDataset(records))(FeatureDataset(records, _, Seq.empty)) } /** - * Load a path name in NarrowPeak format into a FeatureRDD. + * Load a path name in NarrowPeak format into a FeatureDataset. * * @param pathName The path name to load features in NarrowPeak format from. * Globs/directories are supported. @@ -2769,24 +2769,24 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * not set, falls back to the configured Spark default parallelism. Defaults to None. * @param stringency The validation stringency to use when validating NarrowPeak format. * Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadNarrowPeak( pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FeatureRDD = LoadNarrowPeak.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset = LoadNarrowPeak.time { val records = sc.textFile(pathName, optMinPartitions.getOrElse(sc.defaultParallelism)) .flatMap(new NarrowPeakParser().parse(_, stringency)) if (Metrics.isRecording) records.instrument() else records optSequenceDictionary - .fold(FeatureRDD(records))(FeatureRDD(records, _, Seq.empty)) + .fold(FeatureDataset(records))(FeatureDataset(records, _, Seq.empty)) } /** - * Load a path name in IntervalList format into a FeatureRDD. + * Load a path name in IntervalList format into a FeatureDataset. * * @param pathName The path name to load features in IntervalList format from. * Globs/directories are supported. @@ -2794,12 +2794,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * not set, falls back to the configured Spark default parallelism. Defaults to None. * @param stringency The validation stringency to use when validating IntervalList format. * Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. 
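Sketch for the BED6/12 and NarrowPeak loaders above (paths hypothetical):

import org.bdgenomics.adam.rdd.ADAMContext._

val bedFeatures = sc.loadBed("regions.bed")
val peaks = sc.loadNarrowPeak("peaks.narrowPeak")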
*/ def loadIntervalList( pathName: String, optMinPartitions: Option[Int] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FeatureRDD = LoadIntervalList.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset = LoadIntervalList.time { val parsedLines = sc.textFile(pathName, optMinPartitions.getOrElse(sc.defaultParallelism)) .map(new IntervalListParser().parseWithHeader(_, stringency)) @@ -2807,11 +2807,11 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log parsedLines.flatMap(_._2)) if (Metrics.isRecording) records.instrument() else records - FeatureRDD(records, seqDict, Seq.empty) + FeatureDataset(records, seqDict, Seq.empty) } /** - * Load a path name in Parquet + Avro format into a FeatureRDD. + * Load a path name in Parquet + Avro format into a FeatureDataset. * * @param pathName The path name to load features from. * Globs/directories are supported. @@ -2819,31 +2819,30 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadParquetFeatures( pathName: String, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): FeatureRDD = { + optProjection: Option[Schema] = None): FeatureDataset = { val sd = loadAvroSequenceDictionary(pathName) val samples = loadAvroSamples(pathName) (optPredicate, optProjection) match { case (None, None) => { - ParquetUnboundFeatureRDD(sc, pathName, sd, samples) + ParquetUnboundFeatureDataset(sc, pathName, sd, samples) } case (_, _) => { // load from disk val rdd = loadParquet[Feature](pathName, optPredicate, optProjection) - - new RDDBoundFeatureRDD(rdd, sd, samples, optPartitionMap = extractPartitionMap(pathName)) + new RDDBoundFeatureDataset(rdd, sd, samples, optPartitionMap = extractPartitionMap(pathName)) } } } /** - * Load a path name with range binned partitioned Parquet format into a FeatureRDD. + * Load a path name with range binned partitioned Parquet format into a FeatureDataset. * * @param pathName The path name to load alignment records from. * Globs/directories are supported. @@ -2851,15 +2850,15 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping * region when using the filterByOverlappingRegions function on the returned dataset. * Defaults to one partition. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadPartitionedParquetFeatures(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): FeatureRDD = { + optLookbackPartitions: Option[Int] = Some(1)): FeatureDataset = { val partitionedBinSize = getPartitionBinSize(pathName) val features = loadParquetFeatures(pathName) - val featureDatasetBound = DatasetBoundFeatureRDD(features.dataset, + val featureDatasetBound = DatasetBoundFeatureDataset(features.dataset, features.sequences, features.samples, isPartitioned = true, @@ -2871,7 +2870,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in Parquet + Avro format into a NucleotideContigFragmentRDD. + * Load a path name in Parquet + Avro format into a NucleotideContigFragmentDataset. 
* * @param pathName The path name to load nucleotide contig fragments from. * Globs/directories are supported. @@ -2879,23 +2878,23 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns a NucleotideContigFragmentRDD. + * @return Returns a NucleotideContigFragmentDataset. */ def loadParquetContigFragments( pathName: String, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): NucleotideContigFragmentRDD = { + optProjection: Option[Schema] = None): NucleotideContigFragmentDataset = { val sd = loadAvroSequenceDictionary(pathName) (optPredicate, optProjection) match { case (None, None) => { - ParquetUnboundNucleotideContigFragmentRDD( + ParquetUnboundNucleotideContigFragmentDataset( sc, pathName, sd) } case (_, _) => { val rdd = loadParquet[NucleotideContigFragment](pathName, optPredicate, optProjection) - new RDDBoundNucleotideContigFragmentRDD(rdd, + new RDDBoundNucleotideContigFragmentDataset(rdd, sd, optPartitionMap = extractPartitionMap(pathName)) } @@ -2903,7 +2902,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name with range binned partitioned Parquet format into a NucleotideContigFragmentRDD. + * Load a path name with range binned partitioned Parquet format into a NucleotideContigFragmentDataset. * * @param pathName The path name to load alignment records from. * Globs/directories are supported. @@ -2911,15 +2910,15 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping * region when using the filterByOverlappingRegions function on the returned dataset. * Defaults to one partition. - * @return Returns a NucleotideContigFragmentRDD. + * @return Returns a NucleotideContigFragmentDataset. */ def loadPartitionedParquetContigFragments(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): NucleotideContigFragmentRDD = { + optLookbackPartitions: Option[Int] = Some(1)): NucleotideContigFragmentDataset = { val partitionedBinSize = getPartitionBinSize(pathName) val contigs = loadParquetContigFragments(pathName) - val contigsDatasetBound = DatasetBoundNucleotideContigFragmentRDD(contigs.dataset, + val contigsDatasetBound = DatasetBoundNucleotideContigFragmentDataset(contigs.dataset, contigs.sequences, isPartitioned = true, Some(partitionedBinSize), @@ -2930,7 +2929,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load a path name in Parquet + Avro format into a FragmentRDD. + * Load a path name in Parquet + Avro format into a FragmentDataset. * * @param pathName The path name to load fragments from. * Globs/directories are supported. @@ -2938,12 +2937,12 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns a FragmentRDD. + * @return Returns a FragmentDataset. 
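An illustrative sketch of the Parquet + Avro and range-binned partitioned Parquet loaders above; the paths, the contigName pushdown predicate, and the query region are assumptions made for the example:

    import org.apache.parquet.filter2.predicate.FilterApi
    import org.apache.parquet.io.api.Binary
    import org.bdgenomics.adam.models.ReferenceRegion
    import org.bdgenomics.adam.rdd.ADAMContext._

    // with no predicate or projection, the load stays unbound (lazy)
    val features = sc.loadParquetFeatures("features.adam")

    // push a contigName == "chr1" filter down into the Parquet read
    val chr1Features = sc.loadParquetFeatures(
      "features.adam",
      optPredicate = Some(FilterApi.eq(
        FilterApi.binaryColumn("contigName"),
        Binary.fromString("chr1"))))

    // range-binned partitioned Parquet can be narrowed to regions of interest at load time
    val windowed = sc.loadPartitionedParquetFeatures(
      "features.partitioned.adam",
      regions = Iterable(ReferenceRegion("chr1", 0L, 1000000L)))

    // the contig fragment loaders follow the same pattern
    val contigs = sc.loadParquetContigFragments("contigs.adam")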
*/ def loadParquetFragments( pathName: String, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): FragmentRDD = { + optProjection: Option[Schema] = None): FragmentDataset = { // convert avro to sequence dictionary val sd = loadAvroSequenceDictionary(pathName) @@ -2956,13 +2955,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log (optPredicate, optProjection) match { case (None, None) => { - ParquetUnboundFragmentRDD(sc, pathName, sd, rgd, pgs) + ParquetUnboundFragmentDataset(sc, pathName, sd, rgd, pgs) } case (_, _) => { // load from disk val rdd = loadParquet[Fragment](pathName, optPredicate, optProjection) - new RDDBoundFragmentRDD(rdd, + new RDDBoundFragmentDataset(rdd, sd, rgd, pgs, @@ -2972,7 +2971,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load features into a FeatureRDD. + * Load features into a FeatureDataset. * * Loads path names ending in: * * .bed as BED6/12 format, @@ -3007,7 +3006,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param stringency The validation stringency to use when validating BED6/12, GFF3, * GTF/GFF2, NarrowPeak, or IntervalList formats. Defaults to ValidationStringency.STRICT. - * @return Returns a FeatureRDD. + * @return Returns a FeatureDataset. */ def loadFeatures( pathName: String, @@ -3015,7 +3014,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log optMinPartitions: Option[Int] = None, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FeatureRDD = LoadFeatures.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset = LoadFeatures.time { val trimmedPathName = trimExtensionIfCompressed(pathName) if (isBedExt(trimmedPathName)) { @@ -3113,7 +3112,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load nucleotide contig fragments into a NucleotideContigFragmentRDD. + * Load nucleotide contig fragments into a NucleotideContigFragmentDataset. * * If the path name has a .fa/.fasta extension, load as FASTA format. * Else, fall back to Parquet + Avro. @@ -3133,13 +3132,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param optProjection An option projection schema to use when reading Parquet + Avro. * Defaults to None. - * @return Returns a NucleotideContigFragmentRDD. + * @return Returns a NucleotideContigFragmentDataset. */ def loadContigFragments( pathName: String, maximumLength: Long = 10000L, optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): NucleotideContigFragmentRDD = LoadContigFragments.time { + optProjection: Option[Schema] = None): NucleotideContigFragmentDataset = LoadContigFragments.time { val trimmedPathName = trimExtensionIfCompressed(pathName) if (isFastaExt(trimmedPathName)) { @@ -3155,7 +3154,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load genotypes into a GenotypeRDD. + * Load genotypes into a GenotypeDataset. * * If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. * Else, fall back to Parquet + Avro. @@ -3172,13 +3171,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. 
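An illustrative sketch of the extension-dispatching loadFeatures and loadContigFragments entry points; the paths are placeholders:

    import org.bdgenomics.adam.rdd.ADAMContext._

    // loadFeatures dispatches on the file extension (BED6/12, GFF3, GTF/GFF2, NarrowPeak,
    // IntervalList); anything else is read as Parquet + Avro, and compressed text is handled
    val features = sc.loadFeatures("annotations.gff3")

    // loadContigFragments reads .fa/.fasta as FASTA, otherwise Parquet + Avro,
    // splitting sequences longer than maximumLength into fragments
    val contigs = sc.loadContigFragments("reference.fa", maximumLength = 10000L)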
* @param stringency The validation stringency to use when validating VCF format. * Defaults to ValidationStringency.STRICT. - * @return Returns a GenotypeRDD. + * @return Returns a GenotypeDataset. */ def loadGenotypes( pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): GenotypeRDD = LoadGenotypes.time { + stringency: ValidationStringency = ValidationStringency.STRICT): GenotypeDataset = LoadGenotypes.time { if (isVcfExt(pathName)) { log.info(s"Loading $pathName as VCF and converting to Genotypes.") @@ -3190,7 +3189,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load variants into a VariantRDD. + * Load variants into a VariantDataset. * * If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. * Else, fall back to Parquet + Avro. @@ -3206,13 +3205,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param stringency The validation stringency to use when validating VCF format. * Defaults to ValidationStringency.STRICT. - * @return Returns a VariantRDD. + * @return Returns a VariantDataset. */ def loadVariants( pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): VariantRDD = LoadVariants.time { + stringency: ValidationStringency = ValidationStringency.STRICT): VariantDataset = LoadVariants.time { if (isVcfExt(pathName)) { log.info(s"Loading $pathName as VCF and converting to Variants.") @@ -3224,7 +3223,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load alignment records into an AlignmentRecordRDD. + * Load alignment records into an AlignmentRecordDataset. * * Loads path names ending in: * * .bam/.cram/.sam as BAM/CRAM/SAM format, @@ -3258,7 +3257,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param stringency The validation stringency to use when validating BAM/CRAM/SAM or FASTQ formats. * Defaults to ValidationStringency.STRICT. - * @return Returns an AlignmentRecordRDD which wraps the RDD of alignment records, + * @return Returns an AlignmentRecordDataset which wraps the genomic dataset of alignment records, * sequence dictionary representing contigs the alignment records may be aligned to, * and the record group dictionary for the alignment records if one is available. 
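An illustrative sketch of loadGenotypes and loadVariants; the VCF path is a placeholder:

    import htsjdk.samtools.ValidationStringency
    import org.bdgenomics.adam.rdd.ADAMContext._

    // .vcf/.vcf.gz/.vcf.bgz paths are read as VCF; anything else falls back to Parquet + Avro
    val genotypes = sc.loadGenotypes("sample.vcf.gz")
    val variants = sc.loadVariants("sample.vcf.gz", stringency = ValidationStringency.LENIENT)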
*/ @@ -3268,7 +3267,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log optRecordGroup: Option[String] = None, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD = LoadAlignments.time { + stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordDataset = LoadAlignments.time { // need this to pick up possible .bgz extension sc.hadoopConfiguration.setStrings("io.compression.codecs", @@ -3286,7 +3285,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log loadFastq(pathName, optPathName2, optRecordGroup, stringency) } else if (isFastaExt(trimmedPathName)) { log.info(s"Loading $pathName as FASTA and converting to AlignmentRecords.") - AlignmentRecordRDD.unaligned(loadFasta(pathName, maximumLength = 10000L).toReads) + AlignmentRecordDataset.unaligned(loadFasta(pathName, maximumLength = 10000L).toReads) } else { log.info(s"Loading $pathName as Parquet of AlignmentRecords.") loadParquetAlignments(pathName, optPredicate = optPredicate, optProjection = optProjection) @@ -3294,7 +3293,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } /** - * Load fragments into a FragmentRDD. + * Load fragments into a FragmentDataset. * * Loads path names ending in: * * .bam/.cram/.sam as BAM/CRAM/SAM format and @@ -3319,13 +3318,13 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * Defaults to None. * @param stringency The validation stringency to use when validating BAM/CRAM/SAM or FASTQ formats. * Defaults to ValidationStringency.STRICT. - * @return Returns a FragmentRDD. + * @return Returns a FragmentDataset. */ def loadFragments( pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, - stringency: ValidationStringency = ValidationStringency.STRICT): FragmentRDD = LoadFragments.time { + stringency: ValidationStringency = ValidationStringency.STRICT): FragmentDataset = LoadFragments.time { // need this to pick up possible .bgz extension sc.hadoopConfiguration.setStrings("io.compression.codecs", diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicDataset.scala similarity index 71% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicDataset.scala index 3514028491..970c4f8953 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicDataset.scala @@ -40,7 +40,6 @@ import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.{ SparkContext, SparkFiles } import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.function.{ Function => JFunction, Function2 } -import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.{ InstrumentedOutputFormat, RDD } import org.apache.spark.sql.{ DataFrame, Dataset, SQLContext } import org.apache.spark.sql.functions._ @@ -124,9 +123,9 @@ private[rdd] object GenomicDataset { } /** - * A trait that wraps an RDD of genomic data with helpful metadata. + * A trait that wraps an RDD or Dataset of genomic data with helpful metadata. * - * @tparam T The type of the data in the wrapped RDD. + * @tparam T The type of the data in the wrapped RDD or Dataset. 
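An illustrative sketch of loadAlignments and loadFragments; the paths and the record group name are placeholders:

    import org.bdgenomics.adam.rdd.ADAMContext._

    // BAM/CRAM/SAM, FASTQ, FASTA, and Parquet inputs are recognized by extension
    val reads = sc.loadAlignments("sample.bam")

    // paired-end FASTQ, attaching a record group name to the unaligned reads
    val pairedReads = sc.loadAlignments(
      "sample_1.fq",
      optPathName2 = Some("sample_2.fq"),
      optRecordGroup = Some("sample1"))

    // fragments keep the reads of a sequenced fragment together in one record
    val fragments = sc.loadFragments("sample.bam")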
* @tparam U The type of this GenomicDataset. */ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logging { @@ -142,7 +141,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg protected val unproductFn: U => T /** - * @return This data as a Spark SQL DataFrame. + * @return These data as a Spark SQL DataFrame. */ def toDF(): DataFrame = { dataset.toDF() @@ -152,8 +151,8 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Applies a function that transforms the underlying Dataset into a new Dataset * using the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a Dataset. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset as a Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transformDataset(tFn: Dataset[U] => Dataset[U]): V @@ -162,8 +161,8 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Applies a function that transforms the underlying DataFrame into a new DataFrame * using the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a DataFrame. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying data as a DataFrame. + * @return A new genomic dataset where the DataFrame of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transformDataFrame(tFn: DataFrame => DataFrame)( @@ -179,8 +178,8 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Applies a function that transforms the underlying DataFrame into a new DataFrame * using the Spark SQL API. Java-friendly variant. * - * @param tFn A function that transforms the underlying RDD as a DataFrame. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying DataFrame as a DataFrame. + * @return A new genomic dataset where the DataFrame of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transformDataFrame(tFn: JFunction[DataFrame, DataFrame]): V = { @@ -190,11 +189,11 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Applies a function that transmutes the underlying RDD into a new RDD of a + * Applies a function that transmutes the underlying Dataset into a new Dataset of a * different type. * - * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transmuteDataset[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( @@ -205,11 +204,11 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Applies a function that transmutes the underlying RDD into a new RDD of a + * Applies a function that transmutes the underlying Dataset into a new Dataset of a * different type. Java friendly variant. 
* - * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transmuteDataset[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( @@ -221,11 +220,11 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Applies a function that transmutes the underlying RDD into a new RDD of a + * Applies a function that transmutes the underlying DataFrame into a new DataFrame of a * different type. Java friendly variant. * - * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying DataFrame. + * @return A new genomic dataset where the DataFrame of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transmuteDataFrame[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( @@ -240,11 +239,11 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Applies a function that transmutes the underlying RDD into a new RDD of a + * Applies a function that transmutes the underlying DataFrame into a new DataFrame of a * different type. Java friendly variant. * - * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying DataFrame. + * @return A new genomic dataset where the DataFrame of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transmuteDataFrame[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( @@ -304,7 +303,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Saves an RDD to Parquet. + * Saves a genomic dataset to Parquet. * * @param args The output format configuration to use when saving the data. */ @@ -317,7 +316,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Saves this RDD to disk as a Parquet + Avro file. + * Saves a genomic dataset to Parquet. * * @param pathName The path to save the file to. * @param blockSize The size in bytes of blocks to write. @@ -429,7 +428,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Appends sequence metadata to the current RDD. + * Appends sequence metadata to the current genomic dataset. * * @param sequencesToAdd The new sequences to append. * @return Returns a new GenomicDataset with the sequences appended. @@ -439,7 +438,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Appends metadata for a single sequence to the current RDD. + * Appends metadata for a single sequence to the current genomic dataset. * * @param sequenceToAdd The sequence to add. * @return Returns a new GenomicDataset with this sequence appended. @@ -471,27 +470,27 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Unions together multiple genomic RDDs. + * Unions together multiple genomic datasets. * - * @param rdds RDDs to union with this RDD. 
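An illustrative sketch of the Dataset- and DataFrame-level transforms described above, assuming the usual adam-shell/spark-shell setup and that the Feature schema exposes score and contigName columns:

    import org.apache.spark.sql.{ DataFrame, Dataset }
    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.adam.sql.{ Feature => FeatureProduct }

    val features = sc.loadFeatures("annotations.gff3")

    // Dataset-level transform: the function sees the Spark SQL product type, and the
    // sequence dictionary and sample metadata are carried over unchanged
    val dropUnscored: Dataset[FeatureProduct] => Dataset[FeatureProduct] =
      ds => ds.filter(ds("score").isNotNull)
    val scored = features.transformDataset(dropUnscored)

    // DataFrame-level transform, for untyped SQL-style manipulation
    val keepChr1: DataFrame => DataFrame = df => df.filter(df("contigName") === "chr1")
    val chr1 = features.transformDataFrame(keepChr1)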
+ * @param datasets Genomic datasets to union with this genomic dataset. */ - def union(rdds: V*): V + def union(datasets: V*): V /** - * Unions together multiple genomic RDDs. * - * @param rdds RDDs to union with this RDD. + * @param datasets Genomic datasets to union with this genomic dataset. */ - def union(rdds: java.util.List[V]): V = { - val rddSeq: Seq[V] = rdds.toSeq - union(rddSeq: _*) + def union(datasets: java.util.List[V]): V = { + val datasetSeq: Seq[V] = datasets.toSeq + union(datasetSeq: _*) } /** * Applies a function that transforms the underlying RDD into a new RDD. * * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @return A new genomic dataset where the RDD of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transform(tFn: RDD[T] => RDD[T]): V = { @@ -500,9 +499,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg /** * Applies a function that transforms the underlying RDD into a new RDD. + * Java friendly variant. * * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @return A new genomic dataset where the RDD of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transform(tFn: JFunction[JavaRDD[T], JavaRDD[T]]): V = { @@ -514,7 +514,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * different type. * * @param tFn A function that transforms the underlying RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @return A new genomic dataset where the RDD of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transmute[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]](tFn: RDD[T] => RDD[X])( @@ -524,11 +524,11 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg /** * Applies a function that transmutes the underlying RDD into a new RDD of a - * different type. Java friendly version. + * different type. Java friendly variant. * * @param tFn A function that transforms the underlying RDD. * @param convFn The conversion function used to build the final RDD. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @return A new genomic dataset where the RDD of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) are copied without modification. */ def transmute[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( @@ -616,7 +616,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Sorts our genome aligned data by reference positions, with contigs ordered * by index. * - * @return Returns a new RDD containing sorted data. + * @return Returns a new genomic dataset containing sorted data. * * @see sortLexicographically */ @@ -629,9 +629,9 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Sorts our genome aligned data by reference positions, with contigs ordered * by index. * - * @param partitions The number of partitions for the new RDD. + * @param partitions The number of partitions for the new genomic dataset. * @param stringency The level of ValidationStringency to enforce.
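An illustrative sketch of the RDD-level transform and union, with placeholder paths:

    import org.apache.spark.rdd.RDD
    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.formats.avro.Feature

    val a = sc.loadFeatures("batch1.gff3")
    val b = sc.loadFeatures("batch2.gff3")

    // union merges the underlying data and the sequence dictionaries of both inputs
    val merged = a.union(b)

    // RDD-level transform: only the wrapped RDD changes, metadata is copied through
    val deduplicated = merged.transform((rdd: RDD[Feature]) => rdd.distinct())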
- * @return Returns a new RDD containing sorted data. + * @return Returns a new genomic dataset containing sorted data. * * @note Uses ValidationStringency to handle unaligned or where objects align * to multiple positions. @@ -678,7 +678,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Sorts our genome aligned data by reference positions, with contigs ordered * lexicographically. * - * @return Returns a new RDD containing sorted data. + * @return Returns a new genomic dataset containing sorted data. * * @see sort */ @@ -690,12 +690,12 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * Sorts our genome aligned data by reference positions, with contigs ordered * lexicographically. * - * @param partitions The number of partitions for the new RDD. + * @param partitions The number of partitions for the new genomic dataset. * @param storePartitionMap A Boolean flag to determine whether to store the - * partition bounds from the resulting RDD. - * @param storageLevel The level at which to persist the resulting RDD. + * partition bounds from the resulting genomic dataset. + * @param storageLevel The level at which to persist the resulting genomic dataset. * @param stringency The level of ValidationStringency to enforce. - * @return Returns a new RDD containing sorted data. + * @return Returns a new genomic dataset containing sorted data. * * @note Uses ValidationStringency to handle data that is unaligned or where objects * align to multiple positions. @@ -713,12 +713,12 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // We don't use ValidationStringency here because multimapped elements // break downstream methods. require(coveredRegions.size <= 1, - "Cannot sort RDD containing a multimapped element. %s covers %s.".format( + "Cannot sort genomic dataset containing a multimapped element. %s covers %s.".format( elem, coveredRegions.mkString(","))) if (coveredRegions.isEmpty) { throwWarnOrNone[(ReferenceRegion, T)]( - "Cannot sort RDD containing an unmapped element %s.".format(elem), + "Cannot sort genomic dataset containing an unmapped element %s.".format(elem), stringency) } else { Some(coveredRegions.head, elem) @@ -917,7 +917,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * @param flankSize Number of bases to flank each command invocation by. * @param tFormatter Class of formatter for data going into pipe command. * @param xFormatter Formatter for data coming out of the pipe command. - * @param convFn The conversion function used to build the final RDD. + * @param convFn The conversion function used to build the final genomic dataset. * @return Returns a new GenomicDataset of type Y. * * @tparam X The type of the record created by the piped command. @@ -956,7 +956,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * @param flankSize Number of bases to flank each command invocation by. * @param tFormatter Class of formatter for data going into pipe command. * @param xFormatter Formatter for data coming out of the pipe command. - * @param convFn The conversion function used to build the final RDD. + * @param convFn The conversion function used to build the final genomic dataset. * @return Returns a new GenomicDataset of type Y. * * @tparam X The type of the record created by the piped command. 
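An illustrative sketch of sort and sortLexicographically on aligned reads; the path and partition count are placeholders:

    import org.bdgenomics.adam.rdd.ADAMContext._

    val reads = sc.loadAlignments("sample.bam")

    // sort by reference position, with contigs ordered by sequence dictionary index
    val sortedByIndex = reads.sort()

    // sort with contigs ordered lexicographically, repartitioning to 64 partitions
    val sortedLexicographically = reads.sortLexicographically(partitions = 64)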
@@ -998,7 +998,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg pipe[X, Y, Z, W](cmd, files.toSeq, environment.toMap, flankSize)( tFormatterCompanion, xFormatter, - (gRdd: V, rdd: RDD[X]) => convFn.call(gRdd, rdd), + (gDataset: V, rdd: RDD[X]) => convFn.call(gDataset, rdd), ClassTag.AnyRef.asInstanceOf[ClassTag[T]], ClassTag.AnyRef.asInstanceOf[ClassTag[X]]) } @@ -1058,7 +1058,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg * several genomic regions. Java friendly version. * * @param querys The regions to query for. - * @return Returns a new GenomicRDD containing only data that overlaps the + * @return Returns a new GenomicDataset containing only data that overlaps the * querys region. */ def filterByOverlappingRegions(querys: java.lang.Iterable[ReferenceRegion]): V = { @@ -1083,75 +1083,75 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. SparkR friendly version. + * genomic dataset are dropped. SparkR friendly version. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. */ def broadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(T, X), (U, Y)] = { - broadcastRegionJoin(genomicRdd, flankSize.toInt: java.lang.Integer) + broadcastRegionJoin(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. Python/Java friendly version. + * genomic dataset are dropped. Python/Java friendly version. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. 
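An illustrative sketch of filterByOverlappingRegions; the contig names and coordinates are placeholders:

    import org.bdgenomics.adam.models.ReferenceRegion
    import org.bdgenomics.adam.rdd.ADAMContext._

    val features = sc.loadFeatures("annotations.gff3")

    // keep only features overlapping either query region
    val hits = features.filterByOverlappingRegions(Seq(
      ReferenceRegion("chr1", 100000L, 200000L),
      ReferenceRegion("chr2", 0L, 50000L)))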
- * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. */ def broadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(T, X), (U, Y)] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(T, X)]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(U, Y)] - broadcastRegionJoin(genomicRdd, flankSize.toLong) + broadcastRegionJoin(genomicDataset, flankSize.toLong) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. + * genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoinAgainst */ def broadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], @@ -1161,58 +1161,58 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // key the RDDs and join RDDBoundGenericGenomicDataset[(T, X), (U, Y)](InnerTreeRegionJoin[T, X]().broadcastAndJoin( buildTree(flattenRddByRegions().map(f => (f._1.pad(flankSize), f._2))), - genomicRdd.flattenRddByRegions()), - sequences ++ genomicRdd.sequences, + genomicDataset.flattenRddByRegions()), + sequences ++ genomicDataset.sequences, GenericConverter[(T, X), (U, Y)](kv => { // pad by -1 * flankSize to undo pad from preprocessing getReferenceRegions(kv._1).map(_.pad(-1 * flankSize)) ++ - genomicRdd.getReferenceRegions(kv._2) + genomicDataset.getReferenceRegions(kv._2) }, - kv => (productFn(kv._1), genomicRdd.productFn(kv._2)), - kv => (unproductFn(kv._1), genomicRdd.unproductFn(kv._2))), + kv => (productFn(kv._1), genomicDataset.productFn(kv._2)), + kv => (unproductFn(kv._1), genomicDataset.unproductFn(kv._2))), TagHolder[(T, X), (U, Y)]()) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. 
* - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. + * genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoinAgainst */ def broadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], txTag: ClassTag[(T, X)], uyTag: TypeTag[(U, Y)]): GenericGenomicDataset[(T, X), (U, Y)] = { - broadcastRegionJoin(genomicRdd, 0L) + broadcastRegionJoin(genomicDataset, 0L) } /** - * Performs a broadcast inner join between this RDD and data that has been broadcast. + * Performs a broadcast inner join between this genomic dataset and data that has been broadcast. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality * function used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. As compared to broadcastRegionJoin, this function allows the + * genomic dataset are dropped. As compared to broadcastRegionJoin, this function allows the * broadcast object to be reused across multiple joins. * - * @note This function differs from other region joins as it treats the calling RDD + * @note This function differs from other region joins as it treats the calling genomic dataset * as the right side of the join, and not the left. * - * @param broadcastTree The data on the left side of the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param broadcast The data on the left side of the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoin @@ -1237,83 +1237,83 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. 
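An illustrative sketch of the inner broadcast region join; the paths and the 100 bp flank are placeholders:

    import org.bdgenomics.adam.rdd.ADAMContext._

    val targets = sc.loadFeatures("targets.bed")
    val reads = sc.loadAlignments("sample.bam")

    // inner broadcast join: the calling (left) dataset of targets is collected and
    // broadcast, and each overlapping (target, read) pair is kept
    val joined = targets.broadcastRegionJoin(reads)

    // a 100 bp flank also joins pairs that come within 100 bp of each other
    val nearby = targets.broadcastRegionJoin(reads, 100L)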
If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. SparkR friendly version. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. */ def rightOuterBroadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { - rightOuterBroadcastRegionJoin(genomicRdd, flankSize.toInt: java.lang.Integer) + rightOuterBroadcastRegionJoin(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. PySpark/Java friendly version. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. 
*/ def rightOuterBroadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(Option[T], X)]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(Option[U], Y)] - rightOuterBroadcastRegionJoin(genomicRdd, flankSize.toLong) + rightOuterBroadcastRegionJoin(genomicDataset, flankSize.toLong) } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. 
* * @see rightOuterBroadcastRegionJoin */ def rightOuterBroadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], @@ -1323,36 +1323,36 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // key the RDDs and join RDDBoundGenericGenomicDataset[(Option[T], X), (Option[U], Y)](RightOuterTreeRegionJoin[T, X]().broadcastAndJoin( buildTree(flattenRddByRegions().map(f => (f._1.pad(flankSize), f._2))), - genomicRdd.flattenRddByRegions()), - sequences ++ genomicRdd.sequences, + genomicDataset.flattenRddByRegions()), + sequences ++ genomicDataset.sequences, GenericConverter[(Option[T], X), (Option[U], Y)](kv => { // pad by -1 * flankSize to undo pad from preprocessing Seq(kv._1.map(v => getReferenceRegions(v) .map(_.pad(-1 * flankSize)))).flatten.flatten ++ - genomicRdd.getReferenceRegions(kv._2) + genomicDataset.getReferenceRegions(kv._2) }, - kv => (kv._1.map(productFn), genomicRdd.productFn(kv._2)), - kv => (kv._1.map(unproductFn), genomicRdd.unproductFn(kv._2))), + kv => (kv._1.map(productFn), genomicDataset.productFn(kv._2)), + kv => (kv._1.map(unproductFn), genomicDataset.unproductFn(kv._2))), TagHolder[(Option[T], X), (Option[U], Y)]()) } /** - * Performs a broadcast right outer join between this RDD and data that has been broadcast. + * Performs a broadcast right outer join between this genomic dataset and data that has been broadcast. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality * function used for this join is the reference region overlap function. Since this * is a right outer join, all values in the left table that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does * not overlap any values in the left table, it will be paired with a `None` * in the product of the join. As compared to broadcastRegionJoin, this function allows the * broadcast object to be reused across multiple joins. * - * @note This function differs from other region joins as it treats the calling RDD + * @note This function differs from other region joins as it treats the calling genomic dataset * as the right side of the join, and not the left. * - * @param broadcastTree The data on the left side of the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param broadcast The data on the left side of the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see rightOuterBroadcastRegionJoin @@ -1377,107 +1377,107 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. 
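An illustrative sketch of the right outer broadcast region join; the paths are placeholders and the rdd access assumes the usual public rdd member on genomic datasets:

    import org.bdgenomics.adam.rdd.ADAMContext._

    val targets = sc.loadFeatures("targets.bed")
    val reads = sc.loadAlignments("sample.bam")

    // right outer variant: every read is kept, and reads with no overlapping target
    // come back paired with None
    val joined = targets.rightOuterBroadcastRegionJoin(reads)

    // count the reads that overlapped no target
    val offTargetReads = joined.rdd.filter { case (optTarget, _) => optTarget.isEmpty }.count()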
Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. * * @see rightOuterBroadcastRegionJoin */ def rightOuterBroadcastRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], otxTag: ClassTag[(Option[T], X)], ouyTag: TypeTag[(Option[U], Y)]): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { - rightOuterBroadcastRegionJoin(genomicRdd, 0L) + rightOuterBroadcastRegionJoin(genomicDataset, 0L) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. SparkR friendly variant. + * genomic dataset are dropped. SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoinAgainstAndGroupByRight */ def broadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)] = { - broadcastRegionJoinAndGroupByRight(genomicRdd, flankSize.toInt: java.lang.Integer) + broadcastRegionJoinAndGroupByRight(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. 
The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. PySpark/Java friendly variant. + * genomic dataset are dropped. PySpark/Java friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoinAgainstAndGroupByRight */ def broadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(Iterable[T], X)]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(Seq[U], Y)] - broadcastRegionJoinAndGroupByRight(genomicRdd, flankSize.toLong) + broadcastRegionJoinAndGroupByRight(genomicDataset, flankSize.toLong) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. + * genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. 
* * @see broadcastRegionJoinAgainstAndGroupByRight */ def broadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], @@ -1487,34 +1487,34 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // key the RDDs and join RDDBoundGenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)](InnerTreeRegionJoinAndGroupByRight[T, X]().broadcastAndJoin( buildTree(flattenRddByRegions().map(f => (f._1.pad(flankSize), f._2))), - genomicRdd.flattenRddByRegions()), - sequences ++ genomicRdd.sequences, + genomicDataset.flattenRddByRegions()), + sequences ++ genomicDataset.sequences, GenericConverter[(Iterable[T], X), (Seq[U], Y)](kv => { // pad by -1 * flankSize to undo pad from preprocessing (kv._1.flatMap(getReferenceRegions) ++ - genomicRdd.getReferenceRegions(kv._2)) + genomicDataset.getReferenceRegions(kv._2)) .toSeq }, - kv => (kv._1.map(productFn).toSeq, genomicRdd.productFn(kv._2)), - kv => (kv._1.map(unproductFn), genomicRdd.unproductFn(kv._2))), + kv => (kv._1.map(productFn).toSeq, genomicDataset.productFn(kv._2)), + kv => (kv._1.map(unproductFn), genomicDataset.unproductFn(kv._2))), TagHolder[(Iterable[T], X), (Seq[U], Y)]()) } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. As compared to broadcastRegionJoin, this function allows + * genomic dataset are dropped. As compared to broadcastRegionJoin, this function allows * the broadcast object to be reused across multiple joins. * - * @note This function differs from other region joins as it treats the calling RDD + * @note This function differs from other region joins as it treats the calling genomic dataset * as the right side of the join, and not the left. * - * @param broadcastTree The data on the left side of the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param broadcast The data on the left side of the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoinAndGroupByRight @@ -1541,113 +1541,113 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Performs a broadcast inner join between this RDD and another RDD. + * Performs a broadcast inner join between this genomic dataset and another genomic dataset. * - * In a broadcast join, the left RDD (this RDD) is collected to the driver, + * In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, * and broadcast to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is an inner join, all values who do not overlap a value from the other - * RDD are dropped. + * genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. 
+ * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see broadcastRegionJoinAgainstAndGroupByRight */ def broadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], itxTag: ClassTag[(Iterable[T], X)], iuyTag: TypeTag[(Seq[U], Y)]): GenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)] = { - broadcastRegionJoinAndGroupByRight(genomicRdd, 0L) + broadcastRegionJoinAndGroupByRight(genomicDataset, 0L) } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. * * @see rightOuterBroadcastRegionJoinAgainstAndGroupByRight */ def rightOuterBroadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)] = { - rightOuterBroadcastRegionJoinAndGroupByRight(genomicRdd, flankSize.toInt: java.lang.Integer) + rightOuterBroadcastRegionJoinAndGroupByRight(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. 
If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. PySpark/Java friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. * * @see rightOuterBroadcastRegionJoinAgainstAndGroupByRight */ def rightOuterBroadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(Iterable[T], X)]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(Seq[U], Y)] - rightOuterBroadcastRegionJoinAndGroupByRight(genomicRdd, flankSize.toLong) + rightOuterBroadcastRegionJoinAndGroupByRight(genomicDataset, flankSize.toLong) } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. 
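A companion sketch for the right outer variant, reusing the hypothetical features and reads datasets from the sketch above; here every right-side read is retained, whether or not a broadcast feature overlaps it.

// features: FeatureDataset, reads: AlignmentRecordDataset (loaded as above).
// Right outer semantics: reads that overlap no broadcast feature still appear
// in the output, with nothing grouped on the left side of the pair.
val allReadsWithPromoters =
  features.rightOuterBroadcastRegionJoinAndGroupByRight(reads, 0L)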
* * @see rightOuterBroadcastRegionJoinAgainstAndGroupByRight */ def rightOuterBroadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], @@ -1657,36 +1657,36 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // key the RDDs and join RDDBoundGenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)](RightOuterTreeRegionJoinAndGroupByRight[T, X]().broadcastAndJoin( buildTree(flattenRddByRegions().map(f => (f._1.pad(flankSize), f._2))), - genomicRdd.flattenRddByRegions()), - sequences ++ genomicRdd.sequences, + genomicDataset.flattenRddByRegions()), + sequences ++ genomicDataset.sequences, GenericConverter[(Iterable[T], X), (Seq[U], Y)](kv => { // pad by -1 * flankSize to undo pad from preprocessing Seq(kv._1.map(v => getReferenceRegions(v) .map(_.pad(-1 * flankSize)))).flatten.flatten ++ - genomicRdd.getReferenceRegions(kv._2) + genomicDataset.getReferenceRegions(kv._2) }, - kv => (kv._1.map(productFn).toSeq, genomicRdd.productFn(kv._2)), - kv => (kv._1.map(unproductFn), genomicRdd.unproductFn(kv._2))), + kv => (kv._1.map(productFn).toSeq, genomicDataset.productFn(kv._2)), + kv => (kv._1.map(unproductFn), genomicDataset.unproductFn(kv._2))), TagHolder[(Iterable[T], X), (Seq[U], Y)]()) } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this * is a right outer join, all values in the left table that do not overlap a - * value from the right RDD are dropped. If a value from the right RDD does + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does * not overlap any values in the left table, it will be paired with a `None` * in the product of the join. As compared to broadcastRegionJoin, this * function allows the broadcast object to be reused across multiple joins. * - * @note This function differs from other region joins as it treats the calling RDD + * @note This function differs from other region joins as it treats the calling genomic dataset * as the right side of the join, and not the left. * - * @param broadcastTree The data on the left side of the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param broadcast The data on the left side of the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. * * @see rightOuterBroadcastRegionJoinAndGroupByRight @@ -1712,60 +1712,60 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Performs a broadcast right outer join between this RDD and another RDD. + * Performs a broadcast right outer join between this genomic dataset and another genomic dataset. * * In a broadcast join, the left side of the join (broadcastTree) is broadcast to * to all the nodes in the cluster. The key equality function * used for this join is the reference region overlap function. Since this - * is a right outer join, all values in the left RDD that do not overlap a - * value from the right RDD are dropped. 
If a value from the right RDD does - * not overlap any values in the left RDD, it will be paired with a `None` + * is a right outer join, all values in the left genomic dataset that do not overlap a + * value from the right genomic dataset are dropped. If a value from the right genomic dataset does + * not overlap any values in the left genomic dataset, it will be paired with a `None` * in the product of the join. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. * * @see rightOuterBroadcastRegionJoinAgainstAndGroupByRight */ def rightOuterBroadcastRegionJoinAndGroupByRight[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], itxTag: ClassTag[(Iterable[T], X)], iuyTag: TypeTag[(Seq[U], Y)]): GenericGenomicDataset[(Iterable[T], X), (Seq[U], Y)] = { - rightOuterBroadcastRegionJoinAndGroupByRight(genomicRdd, 0L) + rightOuterBroadcastRegionJoinAndGroupByRight(genomicDataset, 0L) } /** - * Prepares two RDDs to be joined with any shuffleRegionJoin. This includes copartition - * and sort of the rightRdd if necessary. + * Prepares two genomic datasets to be joined with any shuffleRegionJoin. This includes copartition + * and sort of the right genomic dataset if necessary. * - * @param genomicRdd The RDD to join to. + * @param genomicDataset The genomic dataset to join to. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. 
* @return a case class containing all the prepared data for ShuffleRegionJoins */ private def prepareForShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int] = None, flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X]): (RDD[(ReferenceRegion, T)], RDD[(ReferenceRegion, X)]) = { val partitions = optPartitions.getOrElse(this.rdd.partitions.length) - val (leftRdd, rightRdd) = (isSorted, genomicRdd.isSorted) match { - case (true, _) => (this, genomicRdd.copartitionByReferenceRegion(this, flankSize)) - case (false, true) => (copartitionByReferenceRegion(genomicRdd, flankSize), genomicRdd) + val (leftRdd, rightRdd) = (isSorted, genomicDataset.isSorted) match { + case (true, _) => (this, genomicDataset.copartitionByReferenceRegion(this, flankSize)) + case (false, true) => (copartitionByReferenceRegion(genomicDataset, flankSize), genomicDataset) case (false, false) => { val repartitionedRdd = sortLexicographically(storePartitionMap = true, partitions = partitions) - (repartitionedRdd, genomicRdd.copartitionByReferenceRegion(repartitionedRdd, flankSize)) + (repartitionedRdd, genomicDataset.copartitionByReferenceRegion(repartitionedRdd, flankSize)) } } (leftRdd.flattenRddByRegions().map(f => (f._1.pad(flankSize), f._2)), @@ -1773,76 +1773,76 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } /** - * Performs a sort-merge inner join between this RDD and another RDD. + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. SparkR friendly variant. + * overlap a value from the other genomic dataset are dropped. SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. */ def shuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(T, X), (U, Y)] = { - shuffleRegionJoin(genomicRdd, flankSize.toInt: java.lang.Integer) + shuffleRegionJoin(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge inner join between this RDD and another RDD. + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. 
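prepareForShuffleRegionJoin is private[rdd], but its branching matters to callers. A sketch of how that plays out, assuming the public sortLexicographically() entry point behaves like the internal sort used in this trait.

// If neither side is sorted, the left dataset is sorted and the right is
// copartitioned against it on every join. Sorting the left side once up front
// (hypothetical sortedReads value) lets repeated shuffle joins against the
// same left dataset skip that re-sort.
val sortedReads = reads.sortLexicographically()
val readsVsPromoters = sortedReads.shuffleRegionJoin(features)
val readsVsEnhancers = sortedReads.shuffleRegionJoin(sc.loadFeatures("enhancers.bed"))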
Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. PySpark/Java friendly + * overlap a value from the other genomic dataset are dropped. PySpark/Java friendly * variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. */ def shuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(T, X), (U, Y)] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(T, X)]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(U, Y)] - shuffleRegionJoin(genomicRdd, flankSize.toLong) + shuffleRegionJoin(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge inner join between this RDD and another RDD. + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. + * overlap a value from the other genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. */ private[rdd] def shuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -1851,10 +1851,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg uyTag: TypeTag[(U, Y)]): GenericGenomicDataset[(T, X), (U, Y)] = InnerShuffleJoin.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? 
- val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(T, X), (U, Y)]( InnerShuffleRegionJoin[T, X](leftRddToJoin, rightRddToJoin) @@ -1863,143 +1863,143 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg GenericConverter[(T, X), (U, Y)](kv => { // pad by -1 * flankSize to undo pad from preprocessing getReferenceRegions(kv._1).map(_.pad(-1 * flankSize)) ++ - genomicRdd.getReferenceRegions(kv._2) + genomicDataset.getReferenceRegions(kv._2) }, - kv => (productFn(kv._1), genomicRdd.productFn(kv._2)), - kv => (unproductFn(kv._1), genomicRdd.unproductFn(kv._2))), + kv => (productFn(kv._1), genomicDataset.productFn(kv._2)), + kv => (unproductFn(kv._1), genomicDataset.unproductFn(kv._2))), TagHolder[(T, X), (U, Y)]()) } /** - * Performs a sort-merge inner join between this RDD and another RDD. + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. + * overlap a value from the other genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. */ def shuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], txTag: ClassTag[(T, X)], uyTag: TypeTag[(U, Y)]): GenericGenomicDataset[(T, X), (U, Y)] = { - shuffleRegionJoin(genomicRdd, None, flankSize) + shuffleRegionJoin(genomicDataset, None, flankSize) } /** - * Performs a sort-merge inner join between this RDD and another RDD. + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. + * overlap a value from the other genomic dataset are dropped. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space. 
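For contrast with the broadcast joins above, a minimal inner sort-merge join sketch using the same hypothetical reads and features datasets; omitting the flank argument requires a strict overlap.

// Result elements are (AlignmentRecord, Feature) pairs whose regions overlap;
// both inputs are copartitioned and sorted before the per-partition merge.
val overlappingPairs = reads.shuffleRegionJoin(features)
overlappingPairs.rdd.take(5).foreach(println)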
*/ def shuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], txTag: ClassTag[(T, X)], uyTag: TypeTag[(U, Y)]): GenericGenomicDataset[(T, X), (U, Y)] = { - shuffleRegionJoin(genomicRdd, None, 0L) + shuffleRegionJoin(genomicDataset, None, 0L) } /** - * Performs a sort-merge right outer join between this RDD and another RDD. + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a right outer join, all values in the - * left RDD that do not overlap a value from the right RDD are dropped. - * If a value from the right RDD does not overlap any values in the left - * RDD, it will be paired with a `None` in the product of the join. SparkR + * left genomic dataset that do not overlap a value from the right genomic dataset are dropped. + * If a value from the right genomic dataset does not overlap any values in the left + * genomic dataset, it will be paired with a `None` in the product of the join. SparkR * friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. */ def rightOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { - rightOuterShuffleRegionJoin(genomicRdd, flankSize.toInt: java.lang.Integer) + rightOuterShuffleRegionJoin(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge right outer join between this RDD and another RDD. + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a right outer join, all values in the - * left RDD that do not overlap a value from the right RDD are dropped. - * If a value from the right RDD does not overlap any values in the left - * RDD, it will be paired with a `None` in the product of the join. + * left genomic dataset that do not overlap a value from the right genomic dataset are dropped. + * If a value from the right genomic dataset does not overlap any values in the left + * genomic dataset, it will be paired with a `None` in the product of the join. * PySpark/Java friendly variant. 
* - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. */ def rightOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(Option[T], X)]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(Option[U], Y)] - rightOuterShuffleRegionJoin(genomicRdd, flankSize.toLong) + rightOuterShuffleRegionJoin(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge right outer join between this RDD and another RDD. + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a right outer join, all values in the - * left RDD that do not overlap a value from the right RDD are dropped. - * If a value from the right RDD does not overlap any values in the left - * RDD, it will be paired with a `None` in the product of the join. + * left genomic dataset that do not overlap a value from the right genomic dataset are dropped. + * If a value from the right genomic dataset does not overlap any values in the left + * genomic dataset, it will be paired with a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. 
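A sketch of the right outer shuffle join, again with the hypothetical reads (left) and features (right) datasets; this is one way to find features that no read covers.

// (Option[AlignmentRecord], Feature): features with no overlapping read come
// through as (None, feature).
val rightOuter = reads.rightOuterShuffleRegionJoin(features)
val uncoveredFeatures = rightOuter.rdd
  .filter { case (optRead, _) => optRead.isEmpty }
  .map { case (_, feature) => feature }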
*/ private[rdd] def rightOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -2008,10 +2008,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg ouyTag: TypeTag[(Option[U], Y)]): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = RightOuterShuffleJoin.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? - val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(Option[T], X), (Option[U], Y)]( LeftOuterShuffleRegionJoin[X, T](rightRddToJoin, leftRddToJoin) @@ -2022,149 +2022,149 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // pad by -1 * flankSize to undo pad from preprocessing Seq(kv._1.map(v => getReferenceRegions(v) .map(_.pad(-1 * flankSize)))).flatten.flatten ++ - genomicRdd.getReferenceRegions(kv._2) + genomicDataset.getReferenceRegions(kv._2) }, - kv => (kv._1.map(productFn), genomicRdd.productFn(kv._2)), - kv => (kv._1.map(unproductFn), genomicRdd.unproductFn(kv._2))), + kv => (kv._1.map(productFn), genomicDataset.productFn(kv._2)), + kv => (kv._1.map(unproductFn), genomicDataset.unproductFn(kv._2))), TagHolder[(Option[T], X), (Option[U], Y)]()) } /** - * Performs a sort-merge right outer join between this RDD and another RDD. + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a right outer join, all values in the - * left RDD that do not overlap a value from the right RDD are dropped. - * If a value from the right RDD does not overlap any values in the left - * RDD, it will be paired with a `None` in the product of the join. + * left genomic dataset that do not overlap a value from the right genomic dataset are dropped. + * If a value from the right genomic dataset does not overlap any values in the left + * genomic dataset, it will be paired with a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. 
*/ def rightOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], otxTag: ClassTag[(Option[T], X)], ouyTag: TypeTag[(Option[U], Y)]): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { - rightOuterShuffleRegionJoin(genomicRdd, None, flankSize) + rightOuterShuffleRegionJoin(genomicDataset, None, flankSize) } /** - * Performs a sort-merge right outer join between this RDD and another RDD. + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a right outer join, all values in the - * left RDD that do not overlap a value from the right RDD are dropped. - * If a value from the right RDD does not overlap any values in the left - * RDD, it will be paired with a `None` in the product of the join. + * left genomic dataset that do not overlap a value from the right genomic dataset are dropped. + * If a value from the right genomic dataset does not overlap any values in the left + * genomic dataset, it will be paired with a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * right RDD that did not overlap a key in the left RDD. + * right genomic dataset that did not overlap a key in the left genomic dataset. */ def rightOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], otxTag: ClassTag[(Option[T], X)], ouyTag: TypeTag[(Option[U], Y)]): GenericGenomicDataset[(Option[T], X), (Option[U], Y)] = { - rightOuterShuffleRegionJoin(genomicRdd, None, 0L) + rightOuterShuffleRegionJoin(genomicDataset, None, 0L) } /** - * Performs a sort-merge left outer join between this RDD and another RDD. + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with a `None` in the product of the join. SparkR + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with a `None` in the product of the join. SparkR * friendly variant. * - * @param genomicRdd The right RDD in the join. 
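The flankSize parameter recurs across all of these overloads; a short sketch of what a non-zero flank does, using the inner shuffle join and the hypothetical datasets above.

// A 1,000 bp flank pads each region before the join, so reads within 1 kbp of
// a feature are joined to it even without a strict overlap; the padding is
// undone when reference regions are recomputed for the output.
val nearbyPairs = reads.shuffleRegionJoin(features, 1000L)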
+ * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ def leftOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(T, Option[X]), (U, Option[Y])] = { - leftOuterShuffleRegionJoin(genomicRdd, flankSize.toInt: java.lang.Integer) + leftOuterShuffleRegionJoin(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge left outer join between this RDD and another RDD. + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with a `None` in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with a `None` in the product of the join. * PySpark/Java friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ def leftOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(T, Option[X]), (U, Option[Y])] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(T, Option[X])]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(U, Option[Y])] - leftOuterShuffleRegionJoin(genomicRdd, flankSize.toLong) + leftOuterShuffleRegionJoin(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge left outer join between this RDD and another RDD. 
+ * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with a `None` in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ private[rdd] def leftOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -2173,10 +2173,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg uoyTag: TypeTag[(U, Option[Y])]): GenericGenomicDataset[(T, Option[X]), (U, Option[Y])] = LeftOuterShuffleJoin.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? 
- val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(T, Option[X]), (U, Option[Y])]( LeftOuterShuffleRegionJoin[T, X](leftRddToJoin, rightRddToJoin) @@ -2185,152 +2185,152 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg GenericConverter[(T, Option[X]), (U, Option[Y])](kv => { // pad by -1 * flankSize to undo pad from preprocessing getReferenceRegions(kv._1).map(_.pad(-1 * flankSize)) ++ - Seq(kv._2.map(v => genomicRdd.getReferenceRegions(v))).flatten.flatten + Seq(kv._2.map(v => genomicDataset.getReferenceRegions(v))).flatten.flatten }, - kv => (productFn(kv._1), kv._2.map(genomicRdd.productFn)), - kv => (unproductFn(kv._1), kv._2.map(genomicRdd.unproductFn))), + kv => (productFn(kv._1), kv._2.map(genomicDataset.productFn)), + kv => (unproductFn(kv._1), kv._2.map(genomicDataset.unproductFn))), TagHolder[(T, Option[X]), (U, Option[Y])]()) } /** - * Performs a sort-merge left outer join between this RDD and another RDD. + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with a `None` in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ def leftOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], toxTag: ClassTag[(T, Option[X])], uoyTag: TypeTag[(U, Option[Y])]): GenericGenomicDataset[(T, Option[X]), (U, Option[Y])] = { - leftOuterShuffleRegionJoin(genomicRdd, None, flankSize) + leftOuterShuffleRegionJoin(genomicDataset, None, flankSize) } /** - * Performs a sort-merge left outer join between this RDD and another RDD. + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. 
* The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with a `None` in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ def leftOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], toxTag: ClassTag[(T, Option[X])], uoyTag: TypeTag[(U, Option[Y])]): GenericGenomicDataset[(T, Option[X]), (U, Option[Y])] = { - leftOuterShuffleRegionJoin(genomicRdd, None, 0L) + leftOuterShuffleRegionJoin(genomicDataset, None, 0L) } /** - * Performs a sort-merge left outer join between this RDD and another RDD, + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with an empty Iterable in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with an empty Iterable in the product of the join. * SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. 
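A sketch of the left outer shuffle join with the same hypothetical datasets; every read survives, so it is a convenient way to spot reads that hit no feature.

// (AlignmentRecord, Option[Feature]): reads that overlap no feature are paired
// with None rather than being dropped.
val leftOuter = reads.leftOuterShuffleRegionJoin(features)
val readsOutsideFeatures = leftOuter.rdd.filter { case (_, optFeature) => optFeature.isEmpty }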
*/ def leftOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { - leftOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, flankSize.toInt: java.lang.Integer) + leftOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge left outer join between this RDD and another RDD, + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with an empty Iterable in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with an empty Iterable in the product of the join. * PySpark/Java friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ def leftOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(T, Iterable[X])]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(U, Seq[Y])] - leftOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, flankSize.toLong) + leftOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge left outer join between this RDD and another RDD, + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. 
Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with an empty Iterable in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with an empty Iterable in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ private[rdd] def leftOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -2339,10 +2339,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg uiyTag: TypeTag[(U, Seq[Y])]): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = LeftOuterShuffleJoin.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? - val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])]( LeftOuterShuffleRegionJoinAndGroupByLeft[T, X](leftRddToJoin, rightRddToJoin) @@ -2351,146 +2351,146 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg GenericConverter[(T, Iterable[X]), (U, Seq[Y])](kv => { // pad by -1 * flankSize to undo flank from preprocessing getReferenceRegions(kv._1).map(_.pad(-1 * flankSize)) ++ - Seq(kv._2.map(v => genomicRdd.getReferenceRegions(v))).flatten.flatten + Seq(kv._2.map(v => genomicDataset.getReferenceRegions(v))).flatten.flatten }, - kv => (productFn(kv._1), kv._2.map(genomicRdd.productFn).toSeq), - kv => (unproductFn(kv._1), kv._2.map(genomicRdd.unproductFn))), + kv => (productFn(kv._1), kv._2.map(genomicDataset.productFn).toSeq), + kv => (unproductFn(kv._1), kv._2.map(genomicDataset.unproductFn))), TagHolder[(T, Iterable[X]), (U, Seq[Y])]()) } /** - * Performs a sort-merge left outer join between this RDD and another RDD, + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. 
The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with an empty Iterable in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with an empty Iterable in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. */ def leftOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], toxTag: ClassTag[(T, Iterable[X])], uiyTag: TypeTag[(U, Seq[Y])]): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { - leftOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, None, flankSize) + leftOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, None, flankSize) } /** - * Performs a sort-merge left outer join between this RDD and another RDD, + * Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a left outer join, all values in the - * right RDD that do not overlap a value from the left RDD are dropped. - * If a value from the left RDD does not overlap any values in the right - * RDD, it will be paired with an empty Iterable in the product of the join. + * right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + * If a value from the left genomic dataset does not overlap any values in the right + * genomic dataset, it will be paired with an empty Iterable in the product of the join. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and all keys from the - * left RDD that did not overlap a key in the right RDD. + * left genomic dataset that did not overlap a key in the right genomic dataset. 
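The grouped left outer variant in the same style; the getReadName accessor on the Avro AlignmentRecord is assumed here purely for illustration.

// (AlignmentRecord, Iterable[Feature]): one row per read with every feature it
// overlaps, or an empty Iterable when nothing overlaps.
val grouped = reads.leftOuterShuffleRegionJoinAndGroupByLeft(features)
val hitsPerRead = grouped.rdd.map { case (read, fs) => (read.getReadName, fs.size) }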
*/ def leftOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], toxTag: ClassTag[(T, Iterable[X])], uiyTag: TypeTag[(U, Seq[Y])]): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { - leftOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, None, 0L) + leftOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, None, 0L) } /** - * Performs a sort-merge full outer join between this RDD and another RDD. + * Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a full outer join, if a value from either - * RDD does not overlap any values in the other RDD, it will be paired with + * genomic dataset does not overlap any values in the other genomic dataset, it will be paired with * a `None` in the product of the join. SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and values that did not * overlap will be paired with a `None`. */ def fullOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(Option[T], Option[X]), (Option[U], Option[Y])] = { - fullOuterShuffleRegionJoin(genomicRdd, flankSize.toInt: java.lang.Integer) + fullOuterShuffleRegionJoin(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge full outer join between this RDD and another RDD. + * Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a full outer join, if a value from either - * RDD does not overlap any values in the other RDD, it will be paired with + * genomic dataset does not overlap any values in the other genomic dataset, it will be paired with * a `None` in the product of the join. PySpark/Java friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and values that did not * overlap will be paired with a `None`. 
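Before the full outer join variants that follow, a minimal usage sketch of the left outer join-and-group method documented above may help. Only leftOuterShuffleRegionJoinAndGroupByLeft itself comes from this patch; the SparkContext, the loadAlignments/loadFeatures loaders, and the input paths are illustrative assumptions.

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def annotateReads(sc: SparkContext): Unit = {
      val reads = sc.loadAlignments("reads.adam")   // assumed ADAMContext loader
      val targets = sc.loadFeatures("targets.bed")  // assumed ADAMContext loader

      // Keep every read; pair it with all overlapping targets, or an empty
      // group when nothing on the right side overlaps (flank defaults to 0).
      val readsWithTargets =
        reads.leftOuterShuffleRegionJoinAndGroupByLeft(targets)

      // Reads that hit no target at all.
      val unannotated = readsWithTargets.rdd.filter { case (_, hits) => hits.isEmpty }
    }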
*/ def fullOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(Option[T], Option[X]), (Option[U], Option[Y])] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(Option[T], Option[X])]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(Option[U], Option[Y])] - fullOuterShuffleRegionJoin(genomicRdd, flankSize.toLong) + fullOuterShuffleRegionJoin(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge full outer join between this RDD and another RDD. + * Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a full outer join, if a value from either - * RDD does not overlap any values in the other RDD, it will be paired with + * genomic dataset does not overlap any values in the other genomic dataset, it will be paired with * a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and values that did not * overlap will be paired with a `None`. */ private[rdd] def fullOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -2499,10 +2499,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg ouoyTag: TypeTag[(Option[U], Option[Y])]): GenericGenomicDataset[(Option[T], Option[X]), (Option[U], Option[Y])] = FullOuterShuffleJoin.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? 
- val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(Option[T], Option[X]), (Option[U], Option[Y])]( FullOuterShuffleRegionJoin[T, X](leftRddToJoin, rightRddToJoin) @@ -2511,143 +2511,143 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg GenericConverter[(Option[T], Option[X]), (Option[U], Option[Y])](kv => { // pad by -1 * flankSize to undo pad from preprocessing Seq(kv._1.map(v => getReferenceRegions(v).map(_.pad(-1 * flankSize))), - kv._2.map(v => genomicRdd.getReferenceRegions(v))).flatten.flatten + kv._2.map(v => genomicDataset.getReferenceRegions(v))).flatten.flatten }, - kv => (kv._1.map(productFn), kv._2.map(genomicRdd.productFn)), - kv => (kv._1.map(unproductFn), kv._2.map(genomicRdd.unproductFn))), + kv => (kv._1.map(productFn), kv._2.map(genomicDataset.productFn)), + kv => (kv._1.map(unproductFn), kv._2.map(genomicDataset.unproductFn))), TagHolder[(Option[T], Option[X]), (Option[U], Option[Y])]()) } /** - * Performs a sort-merge full outer join between this RDD and another RDD. + * Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is a full outer join, if a value from either - * RDD does not overlap any values in the other RDD, it will be paired with + * genomic dataset does not overlap any values in the other genomic dataset, it will be paired with * a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and values that did not * overlap will be paired with a `None`. */ def fullOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], otoxTag: ClassTag[(Option[T], Option[X])], ouoyTag: TypeTag[(Option[U], Option[Y])]): GenericGenomicDataset[(Option[T], Option[X]), (Option[U], Option[Y])] = { - fullOuterShuffleRegionJoin(genomicRdd, None, flankSize) + fullOuterShuffleRegionJoin(genomicDataset, None, flankSize) } /** - * Performs a sort-merge full outer join between this RDD and another RDD. + * Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. 
Since this is a full outer join, if a value from either - * RDD does not overlap any values in the other RDD, it will be paired with + * genomic dataset does not overlap any values in the other genomic dataset, it will be paired with * a `None` in the product of the join. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, and values that did not * overlap will be paired with a `None`. */ def fullOuterShuffleRegionJoin[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], otoxTag: ClassTag[(Option[T], Option[X])], ouoyTag: TypeTag[(Option[U], Option[Y])]): GenericGenomicDataset[(Option[T], Option[X]), (Option[U], Option[Y])] = { - fullOuterShuffleRegionJoin(genomicRdd, None, 0L) + fullOuterShuffleRegionJoin(genomicDataset, None, 0L) } /** - * Performs a sort-merge inner join between this RDD and another RDD, + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. SparkR friendly variant. + * item in the genomic dataset. SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD. + * the value they overlapped in the left genomic dataset. */ def shuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { - shuffleRegionJoinAndGroupByLeft(genomicRdd, flankSize.toInt: java.lang.Integer) + shuffleRegionJoinAndGroupByLeft(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge inner join between this RDD and another RDD, + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. PySpark/Java friendly variant. + * item in the genomic dataset. PySpark/Java friendly variant. 
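Before moving on to the grouped inner join below, here is a sketch of the full outer region join documented above. The loaders and paths are assumptions for illustration; the join call follows the Scala signature in this patch, and every element from both sides survives, with a None standing in for a missing partner.

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def compareCallsToRegions(sc: SparkContext): Unit = {
      val variants = sc.loadVariants("calls.vcf")    // assumed ADAMContext loader
      val regions = sc.loadFeatures("regions.gff3")  // assumed ADAMContext loader

      // Full outer join on genomic overlap: pairs are (Option[Variant], Option[Feature]).
      val joined = variants.fullOuterShuffleRegionJoin(regions)

      // Variants that fall outside every region.
      val unannotated = joined.rdd.collect { case (Some(v), None) => v }
    }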
* - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - the value they overlapped in the left RDD. + the value they overlapped in the left genomic dataset. */ def shuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(T, Iterable[X])]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(U, Seq[Y])] - shuffleRegionJoinAndGroupByLeft(genomicRdd, flankSize.toLong) + shuffleRegionJoinAndGroupByLeft(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge inner join between this RDD and another RDD, + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. In the same operation, - * we group all values by the left item in the RDD. + * overlap a value from the other genomic dataset are dropped. In the same operation, + * we group all values by the left item in the genomic dataset. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD.. + * the value they overlapped in the left genomic dataset. 
*/ private[rdd] def shuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -2656,10 +2656,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg uiyTag: TypeTag[(U, Seq[Y])]): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = ShuffleJoinAndGroupByLeft.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? - val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])]( InnerShuffleRegionJoinAndGroupByLeft[T, X](leftRddToJoin, rightRddToJoin) @@ -2669,153 +2669,153 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // pad by -1 * flankSize to undo pad from preprocessing getReferenceRegions(kv._1) .map(_.pad(-1 * flankSize)) ++ - kv._2.flatMap(v => genomicRdd.getReferenceRegions(v)) + kv._2.flatMap(v => genomicDataset.getReferenceRegions(v)) }, - kv => (productFn(kv._1), kv._2.map(genomicRdd.productFn).toSeq), - kv => (unproductFn(kv._1), kv._2.map(genomicRdd.unproductFn))), + kv => (productFn(kv._1), kv._2.map(genomicDataset.productFn).toSeq), + kv => (unproductFn(kv._1), kv._2.map(genomicDataset.unproductFn))), TagHolder[(T, Iterable[X]), (U, Seq[Y])]()) } /** - * Performs a sort-merge inner join between this RDD and another RDD, + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. In the same operation, - * we group all values by the left item in the RDD. + * overlap a value from the other genomic dataset are dropped. In the same operation, + * we group all values by the left item in the genomic dataset. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD.. + * the value they overlapped in the left genomic dataset. 
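A short sketch of the grouped inner join described above, in user code. The loaders, paths, and the getReadName accessor on the Avro record are assumptions; the join call and its flankSize argument follow the signatures in this patch.

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def countHitsPerRead(sc: SparkContext): Unit = {
      val reads = sc.loadAlignments("reads.adam")      // assumed ADAMContext loader
      val peaks = sc.loadFeatures("peaks.narrowPeak")  // assumed ADAMContext loader

      // Inner join plus group-by-left: reads overlapping no peak are dropped,
      // surviving reads carry every peak they overlap within a 50 bp flank.
      val readsWithPeaks = reads.shuffleRegionJoinAndGroupByLeft(peaks, 50L)

      val hitCounts = readsWithPeaks.rdd.map { case (read, peaksHit) =>
        (read.getReadName, peaksHit.size)  // getReadName is an assumed Avro getter
      }
    }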
*/ def shuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], tixTag: ClassTag[(T, Iterable[X])], uiyTag: TypeTag[(U, Seq[Y])]): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { - shuffleRegionJoinAndGroupByLeft(genomicRdd, None, flankSize) + shuffleRegionJoinAndGroupByLeft(genomicDataset, None, flankSize) } /** - * Performs a sort-merge inner join between this RDD and another RDD, + * Performs a sort-merge inner join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. Since this is an inner join, all values who do not - * overlap a value from the other RDD are dropped. In the same operation, - * we group all values by the left item in the RDD. + * overlap a value from the other genomic dataset are dropped. In the same operation, + * we group all values by the left item in the genomic dataset. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD.. + * the value they overlapped in the left genomic dataset. */ def shuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], tixTag: ClassTag[(T, Iterable[X])], uiyTag: TypeTag[(U, Seq[Y])]): GenericGenomicDataset[(T, Iterable[X]), (U, Seq[Y])] = { - shuffleRegionJoinAndGroupByLeft(genomicRdd, None, 0L) + shuffleRegionJoinAndGroupByLeft(genomicDataset, None, 0L) } /** - * Performs a sort-merge right outer join between this RDD and another RDD, + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value, if not null. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. Since this is a right outer join, all values from the - * right RDD who did not overlap a value from the left RDD are placed into + * item in the genomic dataset. Since this is a right outer join, all values from the + * right genomic dataset who did not overlap a value from the left genomic dataset are placed into * a length-1 Iterable with a `None` key. SparkR friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. 
- * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD, and all values from the - * right RDD that did not overlap an item in the left RDD. + * the value they overlapped in the left genomic dataset, and all values from the + * right genomic dataset that did not overlap an item in the left genomic dataset. */ def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Double): GenericGenomicDataset[(Option[T], Iterable[X]), (Option[U], Seq[Y])] = { - rightOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, flankSize.toInt: java.lang.Integer) + rightOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, flankSize.toInt: java.lang.Integer) } /** - * Performs a sort-merge right outer join between this RDD and another RDD, + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value, if not null. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. Since this is a right outer join, all values from the - * right RDD who did not overlap a value from the left RDD are placed into + * item in the genomic dataset. Since this is a right outer join, all values from the + * right genomic dataset who did not overlap a value from the left genomic dataset are placed into * a length-1 Iterable with a `None` key. PySpark/Java friendly variant. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD, and all values from the - * right RDD that did not overlap an item in the left RDD. + * the value they overlapped in the left genomic dataset, and all values from the + * right genomic dataset that did not overlap an item in the left genomic dataset. 
*/ def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: java.lang.Integer): GenericGenomicDataset[(Option[T], Iterable[X]), (Option[U], Seq[Y])] = { implicit val tTag = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] implicit val xTag = ClassTag.AnyRef.asInstanceOf[ClassTag[X]] implicit val txTag = ClassTag.AnyRef.asInstanceOf[ClassTag[(Option[T], Iterable[X])]] implicit val u1Tag: TypeTag[U] = uTag - implicit val u2Tag: TypeTag[Y] = genomicRdd.uTag + implicit val u2Tag: TypeTag[Y] = genomicDataset.uTag implicit val uyTag = typeTag[(Option[U], Seq[Y])] - rightOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, flankSize.toLong) + rightOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, flankSize.toLong) } /** - * Performs a sort-merge right outer join between this RDD and another RDD, + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value, if not null. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. Since this is a right outer join, all values from the - * right RDD who did not overlap a value from the left RDD are placed into + * item in the genomic dataset. Since this is a right outer join, all values from the + * right genomic dataset who did not overlap a value from the left genomic dataset are placed into * a length-1 Iterable with a `None` key. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param optPartitions Optionally sets the number of output partitions. If - * None, the number of partitions on the resulting RDD does not change. + * None, the number of partitions on the resulting genomic dataset does not change. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD, and all values from the - * right RDD that did not overlap an item in the left RDD. + * the value they overlapped in the left genomic dataset, and all values from the + * right genomic dataset that did not overlap an item in the left genomic dataset. 
*/ private[rdd] def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], optPartitions: Option[Int], flankSize: Long)( implicit tTag: ClassTag[T], @@ -2824,10 +2824,10 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg ouiyTag: TypeTag[(Option[U], Seq[Y])]): GenericGenomicDataset[(Option[T], Iterable[X]), (Option[U], Seq[Y])] = RightOuterShuffleJoinAndGroupByLeft.time { val (leftRddToJoin, rightRddToJoin) = - prepareForShuffleRegionJoin(genomicRdd, optPartitions, flankSize) + prepareForShuffleRegionJoin(genomicDataset, optPartitions, flankSize) // what sequences do we wind up with at the end? - val combinedSequences = sequences ++ genomicRdd.sequences + val combinedSequences = sequences ++ genomicDataset.sequences RDDBoundGenericGenomicDataset[(Option[T], Iterable[X]), (Option[U], Seq[Y])]( RightOuterShuffleRegionJoinAndGroupByLeft[T, X](leftRddToJoin, rightRddToJoin) @@ -2837,93 +2837,93 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg // pad by -1 * flankSize to undo pad from preprocessing kv._1.toSeq.flatMap(v => getReferenceRegions(v) .map(_.pad(-1 * flankSize))) ++ - kv._2.flatMap(v => genomicRdd.getReferenceRegions(v)) + kv._2.flatMap(v => genomicDataset.getReferenceRegions(v)) }, - kv => (kv._1.map(productFn), kv._2.map(genomicRdd.productFn).toSeq), - kv => (kv._1.map(unproductFn), kv._2.map(genomicRdd.unproductFn))), + kv => (kv._1.map(productFn), kv._2.map(genomicDataset.productFn).toSeq), + kv => (kv._1.map(unproductFn), kv._2.map(genomicDataset.unproductFn))), TagHolder[(Option[T], Iterable[X]), (Option[U], Seq[Y])]()) } /** - * Performs a sort-merge right outer join between this RDD and another RDD, + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value, if not null. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. Since this is a right outer join, all values from the - * right RDD who did not overlap a value from the left RDD are placed into + * item in the genomic dataset. Since this is a right outer join, all values from the + * right genomic dataset who did not overlap a value from the left genomic dataset are placed into * a length-1 Iterable with a `None` key. * - * @param genomicRdd The right RDD in the join. + * @param genomicDataset The right genomic dataset in the join. * @param flankSize Sets a flankSize for the distance between elements to be * joined. If set to 0, an overlap is required to join two elements. - * @return Returns a new genomic RDD containing all pairs of keys that + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD, and all values from the - * right RDD that did not overlap an item in the left RDD. + * the value they overlapped in the left genomic dataset, and all values from the + * right genomic dataset that did not overlap an item in the left genomic dataset. 
*/ def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z], + genomicDataset: GenomicDataset[X, Y, Z], flankSize: Long)( implicit tTag: ClassTag[T], xTag: ClassTag[X], otixTag: ClassTag[(Option[T], Iterable[X])], ousyTag: TypeTag[(Option[U], Seq[Y])]): GenericGenomicDataset[(Option[T], Iterable[X]), (Option[U], Seq[Y])] = { - rightOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, None, flankSize) + rightOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, None, flankSize) } /** - * Performs a sort-merge right outer join between this RDD and another RDD, + * Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, * followed by a groupBy on the left value, if not null. * - * In a sort-merge join, both RDDs are co-partitioned and sorted. The + * In a sort-merge join, both genomic datasets are co-partitioned and sorted. The * partitions are then zipped, and we do a merge join on each partition. * The key equality function used for this join is the reference region * overlap function. In the same operation, we group all values by the left - * item in the RDD. Since this is a right outer join, all values from the - * right RDD who did not overlap a value from the left RDD are placed into + * item in the genomic dataset. Since this is a right outer join, all values from the + * right genomic dataset who did not overlap a value from the left genomic dataset are placed into * a length-1 Iterable with a `None` key. * - * @param genomicRdd The right RDD in the join. - * @return Returns a new genomic RDD containing all pairs of keys that + * @param genomicDataset The right genomic dataset in the join. + * @return Returns a new genomic dataset containing all pairs of keys that * overlapped in the genomic coordinate space, grouped together by - * the value they overlapped in the left RDD, and all values from the - * right RDD that did not overlap an item in the left RDD. + * the value they overlapped in the left genomic dataset, and all values from the + * right genomic dataset that did not overlap an item in the left genomic dataset. */ def rightOuterShuffleRegionJoinAndGroupByLeft[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - genomicRdd: GenomicDataset[X, Y, Z])( + genomicDataset: GenomicDataset[X, Y, Z])( implicit tTag: ClassTag[T], xTag: ClassTag[X], otixTag: ClassTag[(Option[T], Iterable[X])], otsyTag: TypeTag[(Option[U], Seq[Y])]): GenericGenomicDataset[(Option[T], Iterable[X]), (Option[U], Seq[Y])] = { - rightOuterShuffleRegionJoinAndGroupByLeft(genomicRdd, None, 0L) + rightOuterShuffleRegionJoinAndGroupByLeft(genomicDataset, None, 0L) } /** - * Copartitions two RDDs according to their ReferenceRegions. + * Copartitions two genomic datasets according to their ReferenceRegions. * * @note This is best used under the condition that (repeatedly) * repartitioning is more expensive than calculating the proper location * of the records of this.rdd. It requires a pass through the co-located - * RDD to get the correct partition(s) for each record. It will assign a + * genomic dataset to get the correct partition(s) for each record. It will assign a * record to multiple partitions if necessary. * - * @param rddToCoPartitionWith The rdd to copartition to. - * @return The newly repartitioned rdd. + * @param datasetToCoPartitionWith The genomic dataset to copartition to. + * @return The newly repartitioned genomic dataset. 
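The right outer grouped join documented above can be exercised as in the following sketch; loaders and paths are assumptions, and the pattern match reflects the documented behaviour that right-side values with no overlapping left value arrive in a length-1 group keyed by None.

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def findUncoveredTargets(sc: SparkContext): Unit = {
      val reads = sc.loadAlignments("reads.adam")   // assumed ADAMContext loader
      val targets = sc.loadFeatures("targets.bed")  // assumed ADAMContext loader

      // Pairs are (Option[AlignmentRecord], Iterable[Feature]).
      val grouped = reads.rightOuterShuffleRegionJoinAndGroupByLeft(targets)

      // Targets that no read covered.
      val uncovered = grouped.rdd
        .collect { case (None, ts) => ts }
        .flatMap(ts => ts)
    }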
*/ private[rdd] def copartitionByReferenceRegion[X, Y <: Product, Z <: GenomicDataset[X, Y, Z]]( - rddToCoPartitionWith: GenomicDataset[X, Y, Z], + datasetToCoPartitionWith: GenomicDataset[X, Y, Z], flankSize: Long = 0L)(implicit tTag: ClassTag[T], xTag: ClassTag[X]): V = { - // if the other RDD is not sorted, we can't guarantee proper copartition - assert(rddToCoPartitionWith.isSorted, - "Cannot copartition with an unsorted rdd!") + // if the other genomic dataset is not sorted, we can't guarantee proper copartition + assert(datasetToCoPartitionWith.isSorted, + "Cannot copartition with an unsorted genomic dataset!") - val destinationPartitionMap = rddToCoPartitionWith.optPartitionMap.get + val destinationPartitionMap = datasetToCoPartitionWith.optPartitionMap.get // number of partitions we will have after repartition val numPartitions = destinationPartitionMap.length @@ -3033,7 +3033,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg ManualRegionPartitioner(numPartitions)) } - replaceRdd(finalPartitionedRDD.values, rddToCoPartitionWith.optPartitionMap) + replaceRdd(finalPartitionedRDD.values, datasetToCoPartitionWith.optPartitionMap) } /** @@ -3084,7 +3084,7 @@ trait GenomicDataset[T, U <: Product, V <: GenomicDataset[T, U, V]] extends Logg } // we pass these conversion functions back and forth between the various -// generic genomic datset implementations, so it makes sense to bundle +// generic genomic dataset implementations, so it makes sense to bundle // them up in a case class case class GenericConverter[T, U] private (regionFn: T => Seq[ReferenceRegion], productFn: T => U, @@ -3100,7 +3100,7 @@ sealed abstract class GenericGenomicDataset[T, U <: Product] extends GenomicData @transient val uTag: TypeTag[U] - def saveAsParquet(filePath: String, + def saveAsParquet(pathName: String, blockSize: Int = 128 * 1024 * 1024, pageSize: Int = 1 * 1024 * 1024, compressCodec: CompressionCodecName = CompressionCodecName.GZIP, @@ -3110,7 +3110,7 @@ sealed abstract class GenericGenomicDataset[T, U <: Product] extends GenomicData .write .format("parquet") .option("spark.sql.parquet.compression.codec", compressCodec.toString.toLowerCase()) - .save(filePath) + .save(pathName) } protected def buildTree( @@ -3171,10 +3171,10 @@ case class DatasetBoundGenericGenomicDataset[T, U <: Product]( // this cannot be in the GenericGenomicDataset trait due to need for the // implicit classtag - def union(rdds: GenericGenomicDataset[T, U]*): GenericGenomicDataset[T, U] = { - val iterableRdds = rdds.toSeq - RDDBoundGenericGenomicDataset(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), + def union(datasets: GenericGenomicDataset[T, U]*): GenericGenomicDataset[T, U] = { + val iterableDatasets = datasets.toSeq + RDDBoundGenericGenomicDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), converter, tagHolder) } @@ -3231,10 +3231,10 @@ case class RDDBoundGenericGenomicDataset[T, U <: Product]( // this cannot be in the GenericGenomicDataset trait due to need for the // implicit classtag - def union(rdds: GenericGenomicDataset[T, U]*): GenericGenomicDataset[T, U] = { - val iterableRdds = rdds.toSeq - RDDBoundGenericGenomicDataset(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), + def union(datasets: GenericGenomicDataset[T, U]*): GenericGenomicDataset[T, U] = { + val iterableDatasets = 
datasets.toSeq + RDDBoundGenericGenomicDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), converter, tagHolder) } @@ -3289,7 +3289,7 @@ trait MultisampleGenomicDataset[T, U <: Product, V <: MultisampleGenomicDataset[ val samples: Seq[Sample] /** - * Replaces the sample metadata attached to the RDD. + * Replaces the sample metadata attached to the genomic dataset. * * @param newSamples The new sample metadata to attach. * @return A GenomicDataset with new sample metadata. @@ -3297,20 +3297,20 @@ trait MultisampleGenomicDataset[T, U <: Product, V <: MultisampleGenomicDataset[ def replaceSamples(newSamples: Iterable[Sample]): V /** - * Adds samples to the current RDD. + * Adds samples to the current genomic dataset. * * @param samplesToAdd Zero or more samples to add. - * @return Returns a new RDD with samples added. + * @return Returns a new genomic dataset with samples added. */ def addSamples(samplesToAdd: Iterable[Sample]): V = { replaceSamples(samples ++ samplesToAdd) } /** - * Adds a single sample to the current RDD. + * Adds a single sample to the current genomic dataset. * * @param sampleToAdd A single sample to add. - * @return Returns a new RDD with this sample added. + * @return Returns a new genomic dataset with this sample added. */ def addSample(sampleToAdd: Sample): V = { addSamples(Seq(sampleToAdd)) @@ -3377,9 +3377,9 @@ trait GenomicDatasetWithLineage[T, U <: Product, V <: GenomicDatasetWithLineage[ val processingSteps: Seq[ProcessingStep] /** - * Replaces the processing steps attached to this RDD. + * Replaces the processing steps attached to this genomic dataset. * - * @param newProcessingSteps The new processing steps to attach to this RDD. + * @param newProcessingSteps The new processing steps to attach to this genomic dataset. * @return Returns a new GenomicDataset with new processing lineage attached. */ def replaceProcessingSteps(newProcessingSteps: Seq[ProcessingStep]): V @@ -3413,7 +3413,7 @@ abstract class AvroRecordGroupGenomicDataset[T <% IndexedRecord: Manifest, U <: val recordGroups: RecordGroupDictionary /** - * Replaces the record groups attached to this RDD. + * Replaces the record groups attached to this genomic dataset. * * @param newRecordGroups The new record group dictionary to attach. * @return Returns a new GenomicDataset with new record groups attached. @@ -3485,10 +3485,10 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti val headerLines: Seq[VCFHeaderLine] /** - * Replaces the header lines attached to this RDD. + * Replaces the header lines attached to this genomic dataset. * - * @param newHeaderLines The new header lines to attach to this RDD. - * @return A new RDD with the header lines replaced. + * @param newHeaderLines The new header lines to attach to this genomic dataset. + * @return A new genomic dataset with the header lines replaced. */ def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): V @@ -3496,7 +3496,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * Appends new header lines to the existing lines. * * @param headerLinesToAdd Zero or more header lines to add. - * @return A new RDD with the new header lines added. + * @return A new genomic dataset with the new header lines added. 
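To illustrate the sample-metadata helpers renamed above, a small sketch follows. The loadGenotypes loader, the assumption that GenotypeDataset is a MultisampleGenomicDataset, and the setSampleId/setName builder fields on the bdg-formats Sample record are all assumptions for the example.

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.formats.avro.Sample

    def tagSample(sc: SparkContext): Unit = {
      val genotypes = sc.loadGenotypes("cohort.vcf")  // assumed ADAMContext loader

      // Builder field names are assumptions for illustration.
      val extraSample = Sample.newBuilder()
        .setSampleId("NA12878")
        .setName("NA12878")
        .build()

      // addSample delegates to addSamples, which delegates to replaceSamples.
      val withSample = genotypes.addSample(extraSample)
    }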
*/ def addHeaderLines(headerLinesToAdd: Seq[VCFHeaderLine]): V = { replaceHeaderLines(headerLines ++ headerLinesToAdd) @@ -3506,7 +3506,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * Appends a new header line to the existing lines. * * @param headerLineToAdd A header line to add. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addHeaderLine(headerLineToAdd: VCFHeaderLine): V = { addHeaderLines(Seq(headerLineToAdd)) @@ -3519,7 +3519,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param count The number of elements in the array. * @param description A description of the data stored in this format field. * @param lineType The type of the data stored in this format field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addFixedArrayFormatHeaderLine(id: String, count: Int, @@ -3537,7 +3537,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param count The number of elements in the array. * @param description A description of the data stored in this format field. * @param lineType The type of the data stored in this format field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addFixedArrayFormatHeaderLine(id: java.lang.String, count: java.lang.Integer, @@ -3552,7 +3552,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this format field. * @param lineType The type of the data stored in this format field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addScalarFormatHeaderLine(id: String, description: String, @@ -3578,7 +3578,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this format field. * @param lineType The type of the data stored in this format field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addGenotypeArrayFormatHeaderLine(id: String, description: String, @@ -3598,7 +3598,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this format field. * @param lineType The type of the data stored in this format field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addAlternateAlleleArrayFormatHeaderLine(id: String, description: String, @@ -3619,7 +3619,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this format field. * @param lineType The type of the data stored in this format field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. 
*/ def addAllAlleleArrayFormatHeaderLine(id: String, description: String, @@ -3637,7 +3637,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param count The number of elements in the array. * @param description A description of the data stored in this info field. * @param lineType The type of the data stored in this info field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addFixedArrayInfoHeaderLine(id: String, count: Int, @@ -3655,7 +3655,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param count The number of elements in the array. * @param description A description of the data stored in this info field. * @param lineType The type of the data stored in this info field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addFixedArrayInfoHeaderLine(id: java.lang.String, count: java.lang.Integer, @@ -3670,7 +3670,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this info field. * @param lineType The type of the data stored in this info field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addScalarInfoHeaderLine(id: String, description: String, @@ -3696,7 +3696,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this info field. * @param lineType The type of the data stored in this info field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addAlternateAlleleArrayInfoHeaderLine(id: String, description: String, @@ -3717,7 +3717,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * @param id The identifier for the field. * @param description A description of the data stored in this info field. * @param lineType The type of the data stored in this info field. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. */ def addAllAlleleArrayInfoHeaderLine(id: String, description: String, @@ -3733,7 +3733,7 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti * * @param id The identifier for the filter. * @param description A description of the filter. - * @return A new RDD with the new header line added. + * @return A new genomic dataset with the new header line added. 
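A sketch of chaining the VCF header-line helpers documented above onto a genotype dataset. The htsjdk VCFHeaderLineType enum for the lineType argument and the assumption that GenotypeDataset mixes in VCFSupportingGenomicDataset are inferences from the surrounding VCF context, not statements from this patch; the loader and path are likewise illustrative.

    import htsjdk.variant.vcf.VCFHeaderLineType
    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def declareHeaders(sc: SparkContext): Unit = {
      val genotypes = sc.loadGenotypes("cohort.vcf")  // assumed ADAMContext loader

      // Each helper returns a new genomic dataset, so the calls chain.
      val annotated = genotypes
        .addScalarFormatHeaderLine("HQ", "Haplotype quality", VCFHeaderLineType.Integer)
        .addFilterHeaderLine("LowQual", "Site quality below threshold")
    }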
*/ def addFilterHeaderLine(id: String, description: String): V = { @@ -3748,10 +3748,10 @@ private[rdd] trait VCFSupportingGenomicDataset[T, U <: Product, V <: VCFSupporti abstract class MultisampleAvroGenomicDataset[T <% IndexedRecord: Manifest, U <: Product, V <: MultisampleAvroGenomicDataset[T, U, V]] extends AvroGenomicDataset[T, U, V] with MultisampleGenomicDataset[T, U, V] { - override protected def saveMetadata(filePath: String): Unit = { - savePartitionMap(filePath) - saveSequences(filePath) - saveSamples(filePath) + override protected def saveMetadata(pathName: String): Unit = { + savePartitionMap(pathName) + saveSequences(pathName) + saveSamples(pathName) } } @@ -3773,7 +3773,7 @@ abstract class AvroGenomicDataset[T <% IndexedRecord: Manifest, U <: Product, V } /** - * Saves an RDD of Avro data to Parquet. + * Saves a genomic dataset of Avro data to Parquet. * * @param pathName The path to save the file to. * @param blockSize The size in bytes of blocks to write. Defaults to 128 * 1024 * 1024. @@ -3816,9 +3816,9 @@ abstract class AvroGenomicDataset[T <% IndexedRecord: Manifest, U <: Product, V * Save the partition map to disk. This is done by adding the partition * map to the schema. * - * @param filePath The filepath where we will save the partition map. + * @param pathName The filepath where we will save the partition map. */ - protected def savePartitionMap(filePath: String): Unit = { + protected def savePartitionMap(pathName: String): Unit = { if (isSorted) { // converting using json4s val jsonString = "partitionMap" -> optPartitionMap.get.toSeq.map(f => @@ -3835,19 +3835,37 @@ abstract class AvroGenomicDataset[T <% IndexedRecord: Manifest, U <: Product, V val schema = Contig.SCHEMA$ schema.addProp("partitionMap", compact(render(jsonString)).asInstanceOf[Any]) - saveAvro("%s/_partitionMap.avro".format(filePath), + saveAvro("%s/_partitionMap.avro".format(pathName), rdd.context, schema, sequences.toAvro) } } + /** + * Called in saveAsParquet after saving genomic dataset to Parquet to save metadata. + * + * Writes any necessary metadata to disk. If not overridden, writes the + * sequence dictionary to disk as Avro. + * + * @param pathName The filepath to the file where we will save the Metadata. + */ override protected def saveMetadata(pathName: String): Unit = { savePartitionMap(pathName) saveSequences(pathName) } - override def saveAsParquet( + /** + * Saves this genomic dataset to disk as a Parquet file. + * + * @param pathName Path to save the file at. + * @param blockSize Size per block. + * @param pageSize Size per page. + * @param compressCodec Name of the compression codec to use. + * @param disableDictionaryEncoding Whether or not to disable bit-packing. + * Default is false. + */ + def saveAsParquet( pathName: String, blockSize: Int = 128 * 1024 * 1024, pageSize: Int = 1 * 1024 * 1024, @@ -3862,9 +3880,9 @@ abstract class AvroGenomicDataset[T <% IndexedRecord: Manifest, U <: Product, V } /** - * Saves this RDD to disk as a Parquet + Avro file. + * Saves this genomic dataset to disk as a Parquet file. * - * @param pathName The path to save the file to. + * @param pathName Path to save the file at. * @param blockSize The size in bytes of blocks to write. * @param pageSize The size in bytes of pages to write. * @param compressCodec The compression codec to apply to pages. @@ -3886,13 +3904,27 @@ abstract class AvroGenomicDataset[T <% IndexedRecord: Manifest, U <: Product, V } /** - * Saves this RDD to disk as a Parquet + Avro file. 
+ * Saves this genomic dataset to disk as a Parquet file. * - * @param pathName The path to save the file to. + * @param pathName Path to save the file at. */ def saveAsParquet(pathName: java.lang.String) { saveAsParquet(new JavaSaveArgs(pathName)) } + + /** + * Save partition size into the partitioned Parquet flag file. + * + * @param pathName Path to save the file at. + * @param partitionSize Partition bin size, in base pairs, used in Hive-style partitioning. + */ + private def writePartitionedParquetFlag(pathName: String, partitionSize: Int): Unit = { + val path = new Path(pathName, "_partitionedByStartPos") + val fs: FileSystem = path.getFileSystem(rdd.context.hadoopConfiguration) + val f = fs.create(path) + f.writeInt(partitionSize) + f.close() + } } private[rdd] class InstrumentedADAMAvroParquetOutputFormat extends InstrumentedOutputFormat[Void, IndexedRecord] { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala index a81a51a5b6..766f7b36cb 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala @@ -31,25 +31,25 @@ private[rdd] class InFormatterRunner[T, U <: Product, V <: GenomicDataset[T, U, } /** - * A trait for singleton objects that build an InFormatter from a GenomicRDD. + * A trait for singleton objects that build an InFormatter from a GenomicDataset. * * Often, when creating an outputstream, we need to add metadata to the output * that is not attached to individual records. An example of this is writing a * header with contig/read group/format info, as is done with SAM/BAM/VCF. * * @tparam T The type of the records this InFormatter writes out. - * @tparam U The type of the GenomicRDD this companion object understands. + * @tparam U The type of the GenomicDataset this companion object understands. * @tparam V The type of InFormatter this companion object creates. */ trait InFormatterCompanion[T, U <: Product, V <: GenomicDataset[T, U, V], W <: InFormatter[T, U, V, W]] { /** - * Creates an InFormatter from a GenomicRDD. + * Creates an InFormatter from a GenomicDataset. * - * @param gRdd The GenomicRDD to get metadata from. + * @param gDataset The GenomicDataset to get metadata from. * @return Returns an InFormatter with attached metadata. */ - def apply(gRdd: V): W + def apply(gDataset: V): W } /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDataset.scala similarity index 84% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDataset.scala index bae8675886..2e98ce6776 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDataset.scala @@ -73,16 +73,16 @@ private[adam] class NucleotideContigFragmentArraySerializer extends IntervalArra } } -object NucleotideContigFragmentRDD extends Serializable { +object NucleotideContigFragmentDataset extends Serializable { /** - * Builds a NucleotideContigFragmentRDD when no sequence dictionary is given. + * Builds a NucleotideContigFragmentDataset when no sequence dictionary is given. * * @param rdd Underlying RDD. 
We recompute the sequence dictionary from * this RDD. - * @return Returns a new NucleotideContigFragmentRDD. + * @return Returns a new NucleotideContigFragmentDataset. */ - private[rdd] def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + private[rdd] def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { // get sequence dictionary val sd = new SequenceDictionary(rdd.flatMap(ncf => { @@ -95,27 +95,27 @@ object NucleotideContigFragmentRDD extends Serializable { .collect .toVector) - NucleotideContigFragmentRDD(rdd, sd) + NucleotideContigFragmentDataset(rdd, sd) } /** - * Builds a NucleotideContigFragmentRDD without a partition map. + * Builds a NucleotideContigFragmentDataset without a partition map. * * @param rdd The underlying NucleotideContigFragment RDD. * @param sequences The sequence dictionary for the RDD. - * @return A new NucleotideContigFragmentRDD. + * @return A new NucleotideContigFragmentDataset. */ def apply(rdd: RDD[NucleotideContigFragment], - sequences: SequenceDictionary): NucleotideContigFragmentRDD = { + sequences: SequenceDictionary): NucleotideContigFragmentDataset = { - RDDBoundNucleotideContigFragmentRDD(rdd, sequences, None) + RDDBoundNucleotideContigFragmentDataset(rdd, sequences, None) } } -case class ParquetUnboundNucleotideContigFragmentRDD private[rdd] ( +case class ParquetUnboundNucleotideContigFragmentDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, - sequences: SequenceDictionary) extends NucleotideContigFragmentRDD { + sequences: SequenceDictionary) extends NucleotideContigFragmentDataset { protected lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) @@ -130,18 +130,18 @@ case class ParquetUnboundNucleotideContigFragmentRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): NucleotideContigFragmentRDD = { + newSequences: SequenceDictionary): NucleotideContigFragmentDataset = { copy(sequences = newSequences) } } -case class DatasetBoundNucleotideContigFragmentRDD private[rdd] ( +case class DatasetBoundNucleotideContigFragmentDataset private[rdd] ( dataset: Dataset[NucleotideContigFragmentProduct], sequences: SequenceDictionary, override val isPartitioned: Boolean = true, override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends NucleotideContigFragmentRDD - with DatasetBoundGenomicDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { + override val optLookbackPartitions: Option[Int] = Some(1)) extends NucleotideContigFragmentDataset + with DatasetBoundGenomicDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { lazy val rdd: RDD[NucleotideContigFragment] = dataset.rdd.map(_.toAvro) @@ -162,7 +162,7 @@ case class DatasetBoundNucleotideContigFragmentRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): NucleotideContigFragmentRDD = { + newSequences: SequenceDictionary): NucleotideContigFragmentDataset = { copy(sequences = newSequences) } } @@ -173,10 +173,10 @@ case class DatasetBoundNucleotideContigFragmentRDD private[rdd] ( * @param rdd Underlying RDD * @param sequences Sequence dictionary computed from rdd */ -case class RDDBoundNucleotideContigFragmentRDD private[rdd] ( +case class RDDBoundNucleotideContigFragmentDataset private[rdd] ( rdd: RDD[NucleotideContigFragment], sequences: SequenceDictionary, - optPartitionMap: 
Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends NucleotideContigFragmentRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends NucleotideContigFragmentDataset { /** * A SQL Dataset of contig fragments. @@ -188,12 +188,12 @@ case class RDDBoundNucleotideContigFragmentRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): NucleotideContigFragmentRDD = { + newSequences: SequenceDictionary): NucleotideContigFragmentDataset = { copy(sequences = newSequences) } } -sealed abstract class NucleotideContigFragmentRDD extends AvroGenomicDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD] { +sealed abstract class NucleotideContigFragmentDataset extends AvroGenomicDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { protected val productFn = NucleotideContigFragmentProduct.fromAvro(_) protected val unproductFn = (c: NucleotideContigFragmentProduct) => c.toAvro @@ -215,22 +215,22 @@ sealed abstract class NucleotideContigFragmentRDD extends AvroGenomicDataset[Nuc FragmentConverter.convertRdd(rdd) } - def union(rdds: NucleotideContigFragmentRDD*): NucleotideContigFragmentRDD = { - val iterableRdds = rdds.toSeq - NucleotideContigFragmentRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _)) + def union(datasets: NucleotideContigFragmentDataset*): NucleotideContigFragmentDataset = { + val iterableDatasets = datasets.toSeq + NucleotideContigFragmentDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _)) } /** * Replaces the underlying RDD with a new RDD. * - * @param newRdd The RDD to use for the new NucleotideContigFragmentRDD. - * @return Returns a new NucleotideContigFragmentRDD where the underlying RDD + * @param newRdd The RDD to use for the new NucleotideContigFragmentDataset. + * @return Returns a new NucleotideContigFragmentDataset where the underlying RDD * has been replaced. */ protected def replaceRdd(newRdd: RDD[NucleotideContigFragment], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): NucleotideContigFragmentRDD = { - new RDDBoundNucleotideContigFragmentRDD(newRdd, sequences, newPartitionMap) + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): NucleotideContigFragmentDataset = { + new RDDBoundNucleotideContigFragmentDataset(newRdd, sequences, newPartitionMap) } /** @@ -244,16 +244,16 @@ sealed abstract class NucleotideContigFragmentRDD extends AvroGenomicDataset[Nuc } /** - * Applies a function that transforms the underlying RDD into a new RDD using + * Applies a function that transforms the underlying Dataset into a new Dataset using * the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a Dataset. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset as a Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) is copied without modification. 
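After the rename the call sites change only in the types they name. A minimal sketch of combining two contig fragment datasets and then repartitioning the backing Dataset might look like the following; the two inputs are assumed to have been loaded elsewhere, and the repartition count is arbitrary.

    import org.apache.spark.sql.Dataset
    import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset
    import org.bdgenomics.adam.sql.{ NucleotideContigFragment => NucleotideContigFragmentProduct }

    // union merges the sequence dictionaries of both inputs; transformDataset swaps
    // only the underlying Dataset and copies the merged metadata unchanged.
    def combineAndRepartition(
      a: NucleotideContigFragmentDataset,
      b: NucleotideContigFragmentDataset): NucleotideContigFragmentDataset = {
      a.union(b).transformDataset((ds: Dataset[NucleotideContigFragmentProduct]) =>
        ds.repartition(32))
    }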
*/ def transformDataset( - tFn: Dataset[NucleotideContigFragmentProduct] => Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentRDD = { - DatasetBoundNucleotideContigFragmentRDD(tFn(dataset), sequences) + tFn: Dataset[NucleotideContigFragmentProduct] => Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { + DatasetBoundNucleotideContigFragmentDataset(tFn(dataset), sequences) } /** @@ -322,10 +322,10 @@ sealed abstract class NucleotideContigFragmentRDD extends AvroGenomicDataset[Nuc /** * Merge fragments by contig name. * - * @return Returns a NucleotideContigFragmentRDD containing a single fragment + * @return Returns a NucleotideContigFragmentDataset containing a single fragment * per contig. */ - def mergeFragments(): NucleotideContigFragmentRDD = { + def mergeFragments(): NucleotideContigFragmentDataset = { def merge(first: NucleotideContigFragment, second: NucleotideContigFragment): NucleotideContigFragment = { val merged = NucleotideContigFragment.newBuilder(first) @@ -443,29 +443,29 @@ sealed abstract class NucleotideContigFragmentRDD extends AvroGenomicDataset[Nuc } /** - * For all adjacent records in the RDD, we extend the records so that the adjacent + * For all adjacent records in the genomic dataset, we extend the records so that the adjacent * records now overlap by _n_ bases, where _n_ is the flank length. * * Java friendly variant. * * @param flankLength The length to extend adjacent records by. - * @return Returns the RDD, with all adjacent fragments extended with flanking sequence. + * @return Returns the genomic dataset, with all adjacent fragments extended with flanking sequence. */ def flankAdjacentFragments( - flankLength: java.lang.Integer): NucleotideContigFragmentRDD = { + flankLength: java.lang.Integer): NucleotideContigFragmentDataset = { val flank: Int = flankLength flankAdjacentFragments(flank) } /** - * For all adjacent records in the RDD, we extend the records so that the adjacent + * For all adjacent records in the genomic dataset, we extend the records so that the adjacent * records now overlap by _n_ bases, where _n_ is the flank length. * * @param flankLength The length to extend adjacent records by. - * @return Returns the RDD, with all adjacent fragments extended with flanking sequence. + * @return Returns the genomic dataset, with all adjacent fragments extended with flanking sequence. */ def flankAdjacentFragments( - flankLength: Int): NucleotideContigFragmentRDD = { + flankLength: Int): NucleotideContigFragmentDataset = { replaceRdd(FlankReferenceFragments(rdd, sequences, flankLength)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/BEDInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/BEDInFormatter.scala index 86ad8e766a..f7f1733ab1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/BEDInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/BEDInFormatter.scala @@ -30,19 +30,19 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that builds a BEDInFormatter to write features in BED format to a pipe. */ -object BEDInFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureRDD, BEDInFormatter] { +object BEDInFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureDataset, BEDInFormatter] { /** - * Apply method for building the BEDInFormatter from a FeatureRDD. + * Apply method for building the BEDInFormatter from a FeatureDataset. * - * @param fRdd FeatureRDD to build from. 
+ * @param fRdd FeatureDataset to build from. */ - def apply(fRdd: FeatureRDD): BEDInFormatter = { + def apply(fRdd: FeatureDataset): BEDInFormatter = { BEDInFormatter() } } -case class BEDInFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureRDD, BEDInFormatter] { +case class BEDInFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureDataset, BEDInFormatter] { protected val companion = BEDInFormatter /** @@ -56,7 +56,7 @@ case class BEDInFormatter private () extends InFormatter[Feature, FeatureProduct // write the features iter.foreach(f => { - writer.write(FeatureRDD.toBed(f)) + writer.write(FeatureDataset.toBed(f)) writer.newLine() }) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageDataset.scala similarity index 73% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageDataset.scala index 461b9e8c5e..53d6f580ec 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageDataset.scala @@ -69,19 +69,19 @@ private[adam] class CoverageArraySerializer(kryo: Kryo) extends IntervalArraySer } } -object CoverageRDD { +object CoverageDataset { /** - * A GenomicRDD that wraps a dataset of Coverage data. + * A GenomicDataset that wraps a dataset of Coverage data with an empty sequence dictionary. * * @param ds A Dataset of genomic Coverage features. */ - def apply(ds: Dataset[Coverage]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, SequenceDictionary.empty, Seq.empty[Sample]) + def apply(ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, SequenceDictionary.empty, Seq.empty[Sample]) } /** - * A GenomicRDD that wraps a dataset of Coverage data. + * A GenomicDataset that wraps a dataset of Coverage data given a sequence dictionary. * * @param ds A Dataset of genomic Coverage features. * @param sequences The reference genome these data are aligned to. @@ -89,40 +89,40 @@ object CoverageRDD { */ def apply(ds: Dataset[Coverage], sequences: SequenceDictionary, - samples: Seq[Sample]): CoverageRDD = { - new DatasetBoundCoverageRDD(ds, sequences, samples) + samples: Seq[Sample]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, sequences, samples) } /** - * Builds a CoverageRDD with an empty sequence dictionary. + * A CoverageDataset that wraps an RDD of Coverage data with an empty sequence dictionary. * * @param rdd The underlying Coverage RDD to build from. - * @return Returns a new CoverageRDD. + * @return Returns a new CoverageDataset. */ - def apply(rdd: RDD[Coverage]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, SequenceDictionary.empty, Seq.empty[Sample], None) + def apply(rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, SequenceDictionary.empty, Seq.empty[Sample], None) } /** - * Builds a CoverageRDD given a sequence dictionary. + * A CoverageDataset that wraps an RDD of Coverage data given a sequence dictionary. * * @param rdd The underlying Coverage RDD to build from. - * @param sd The sequence dictionary for this Coverage RDD. - * @param samples The samples in this Coverage RDD. - * @return Returns a new CoverageRDD. + * @param sd The sequence dictionary for this CoverageDataset. + * @param samples The samples in this CoverageDataset. 
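The companion apply overloads keep the same shapes under the new name. A sketch of wrapping an existing RDD[Coverage], with and without metadata, follows; the inputs and the import locations for the model classes are assumed from the surrounding ADAM code rather than shown in this hunk.

    import org.apache.spark.rdd.RDD
    import org.bdgenomics.adam.models.{ Coverage, SequenceDictionary }
    import org.bdgenomics.adam.rdd.feature.CoverageDataset
    import org.bdgenomics.formats.avro.Sample

    // Wrap a bare RDD with an empty sequence dictionary...
    def wrap(cov: RDD[Coverage]): CoverageDataset =
      CoverageDataset(cov)

    // ...or attach the reference dictionary and sample metadata explicitly.
    def wrapWithMetadata(cov: RDD[Coverage],
                         sd: SequenceDictionary,
                         samples: Seq[Sample]): CoverageDataset =
      CoverageDataset(cov, sd, samples)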
+ * @return Returns a new CoverageDataset. */ def apply(rdd: RDD[Coverage], sd: SequenceDictionary, - samples: Seq[Sample]): CoverageRDD = { - new RDDBoundCoverageRDD(rdd, sd, samples, None) + samples: Seq[Sample]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, sd, samples, None) } } -case class ParquetUnboundCoverageRDD private[rdd] ( +case class ParquetUnboundCoverageDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, sequences: SequenceDictionary, - @transient samples: Seq[Sample]) extends CoverageRDD { + @transient samples: Seq[Sample]) extends CoverageDataset { lazy val rdd: RDD[Coverage] = { sc.loadParquetCoverage(parquetFilename, @@ -141,47 +141,27 @@ case class ParquetUnboundCoverageRDD private[rdd] ( .as[Coverage] } - def toFeatures(): FeatureRDD = { - ParquetUnboundFeatureRDD(sc, parquetFilename, sequences, samples) + override def toFeatures(): FeatureDataset = { + ParquetUnboundFeatureDataset(sc, parquetFilename, sequences, samples) } - /** - * Replaces the sequence dictionary attached to a CoverageRDD. - * - * @param newSequences The new sequence dictionary to attach. - * @return Returns a new CoverageRDD with the sequences replaced. - */ - override def replaceSequences( - newSequences: SequenceDictionary): CoverageRDD = { + override def replaceSequences(newSequences: SequenceDictionary): CoverageDataset = { copy(sequences = newSequences) } - /** - * Replaces the sample metadata attached to the RDD. - * - * @param newSamples The new sample metadata to attach. - * @return A CoverageRDD with new sample metadata. - */ - override def replaceSamples(newSamples: Iterable[Sample]): CoverageRDD = { + override def replaceSamples(newSamples: Iterable[Sample]): CoverageDataset = { copy(samples = newSamples.toSeq) } } -/** - * A Dataset containing Coverage data. - * - * @param dataset A SQL Dataset containing data describing how many reads cover - * a genomic locus/region. - * @param sequences A dictionary describing the reference genome. - */ -case class DatasetBoundCoverageRDD private[rdd] ( +case class DatasetBoundCoverageDataset private[rdd] ( dataset: Dataset[Coverage], sequences: SequenceDictionary, @transient samples: Seq[Sample], override val isPartitioned: Boolean = false, override val optPartitionBinSize: Option[Int] = None, - override val optLookbackPartitions: Option[Int] = None) extends CoverageRDD - with DatasetBoundGenomicDataset[Coverage, Coverage, CoverageRDD] { + override val optLookbackPartitions: Option[Int] = None) extends CoverageDataset + with DatasetBoundGenomicDataset[Coverage, Coverage, CoverageDataset] { protected lazy val optPartitionMap = None @@ -189,45 +169,25 @@ case class DatasetBoundCoverageRDD private[rdd] ( dataset.rdd } - def toFeatures(): FeatureRDD = { + override def toFeatures(): FeatureDataset = { import dataset.sqlContext.implicits._ - DatasetBoundFeatureRDD(dataset.map(_.toSqlFeature), sequences, samples) + DatasetBoundFeatureDataset(dataset.map(_.toSqlFeature), sequences, samples) } - /** - * Replaces the sequence dictionary attached to a DatasetBoundCoverageRDD. - * - * @param newSequences The new sequence dictionary to attach. - * @return Returns a new DatasetBoundCoverageRDD with the sequences replaced. 
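As a brief usage note for the conversion and metadata methods above: replaceSamples only swaps the attached sample metadata, and toFeatures carries the sequences and samples over to the resulting FeatureDataset. A sketch, assuming the coverage dataset already exists:

    import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }
    import org.bdgenomics.formats.avro.Sample

    def relabelAndConvert(coverage: CoverageDataset,
                          newSamples: Seq[Sample]): FeatureDataset = {
      coverage
        .replaceSamples(newSamples) // metadata-only change
        .toFeatures()               // coverage values end up in the Feature score attribute
    }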
- */ - override def replaceSequences( - newSequences: SequenceDictionary): CoverageRDD = { + override def replaceSequences(newSequences: SequenceDictionary): CoverageDataset = { copy(sequences = newSequences) } - /** - * Replaces the sample metadata attached to the DatasetBoundCoverageRDD. - * - * @param newSamples The new sample metadata to attach. - * @return A DatasetBoundCoverageRDD with new sample metadata. - */ - override def replaceSamples(newSamples: Iterable[Sample]): CoverageRDD = { + override def replaceSamples(newSamples: Iterable[Sample]): CoverageDataset = { copy(samples = newSamples.toSeq) } } -/** - * An RDD containing Coverage data. - * - * @param rdd An RDD containing data describing how many reads cover a genomic - * locus/region. - * @param sequences A dictionary describing the reference genome. - */ -case class RDDBoundCoverageRDD private[rdd] ( +case class RDDBoundCoverageDataset private[rdd] ( rdd: RDD[Coverage], sequences: SequenceDictionary, @transient samples: Seq[Sample], - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends CoverageRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends CoverageDataset { lazy val dataset: Dataset[Coverage] = { val sqlContext = SQLContext.getOrCreate(rdd.context) @@ -235,24 +195,23 @@ case class RDDBoundCoverageRDD private[rdd] ( sqlContext.createDataset(rdd) } - def toFeatures(): FeatureRDD = { - val featureRdd = rdd.map(_.toFeature) - new RDDBoundFeatureRDD(featureRdd, sequences, samples, optPartitionMap = optPartitionMap) + override def toFeatures(): FeatureDataset = { + val features = rdd.map(_.toFeature) + new RDDBoundFeatureDataset(features, sequences, samples, optPartitionMap = optPartitionMap) } - override def replaceSequences( - newSequences: SequenceDictionary): CoverageRDD = { + override def replaceSequences(newSequences: SequenceDictionary): CoverageDataset = { copy(sequences = newSequences) } - override def replaceSamples(newSamples: Iterable[Sample]): CoverageRDD = { + override def replaceSamples(newSamples: Iterable[Sample]): CoverageDataset = { copy(samples = newSamples.toSeq) } } -abstract class CoverageRDD - extends MultisampleGenomicDataset[Coverage, Coverage, CoverageRDD] - with GenomicDataset[Coverage, Coverage, CoverageRDD] { +abstract class CoverageDataset + extends MultisampleGenomicDataset[Coverage, Coverage, CoverageDataset] + with GenomicDataset[Coverage, Coverage, CoverageDataset] { protected val productFn = (c: Coverage) => c protected val unproductFn = (c: Coverage) => c @@ -264,20 +223,19 @@ abstract class CoverageRDD IntervalArray(rdd, CoverageArray.apply(_, _)) } - def union(rdds: CoverageRDD*): CoverageRDD = { - val iterableRdds = rdds.toSeq - - val mergedSequences = iterableRdds.map(_.sequences).fold(sequences)(_ ++ _) - val mergedSamples = (samples ++ iterableRdds.flatMap(_.samples)).distinct.toSeq + def union(datasets: CoverageDataset*): CoverageDataset = { + val iterableDatasets = datasets.toSeq + val mergedSequences = iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _) + val mergedSamples = (samples ++ iterableDatasets.flatMap(_.samples)).distinct.toSeq - if (iterableRdds.forall(rdd => rdd match { - case DatasetBoundCoverageRDD(_, _, _, _, _, _) => true - case _ => false + if (iterableDatasets.forall(dataset => dataset match { + case DatasetBoundCoverageDataset(_, _, _, _, _, _) => true + case _ => false })) { - DatasetBoundCoverageRDD(iterableRdds.map(_.dataset) + 
DatasetBoundCoverageDataset(iterableDatasets.map(_.dataset) .fold(dataset)(_.union(_)), mergedSequences, mergedSamples) } else { - RDDBoundCoverageRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), + RDDBoundCoverageDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), mergedSequences, mergedSamples, None) @@ -298,14 +256,14 @@ abstract class CoverageRDD } def transformDataset( - tFn: Dataset[Coverage] => Dataset[Coverage]): CoverageRDD = { - DatasetBoundCoverageRDD(tFn(dataset), sequences, samples) + tFn: Dataset[Coverage] => Dataset[Coverage]): CoverageDataset = { + DatasetBoundCoverageDataset(tFn(dataset), sequences, samples) } /** * Saves coverage as feature file. * - * @see FeatureRDD.save + * @see FeatureDataset.save * * Supported file formats include bed, narrowPeak and parquet. Coverage is saved * as a feature where coverage is stored in score attribute. @@ -340,7 +298,7 @@ abstract class CoverageRDD * * @return merged tuples of adjacent ReferenceRegions and coverage. */ - def collapse(): CoverageRDD = { + def collapse(): CoverageDataset = { val newRDD: RDD[Coverage] = rdd .mapPartitions(iter => { // must sort values to iteratively collapse coverage @@ -391,11 +349,11 @@ abstract class CoverageRDD } /** - * Converts CoverageRDD to FeatureRDD. + * Converts CoverageDataset to FeatureDataset. * - * @return Returns a FeatureRDD from CoverageRDD. + * @return Returns a FeatureDataset from CoverageDataset. */ - def toFeatures(): FeatureRDD + def toFeatures(): FeatureDataset /** * Gets coverage overlapping specified ReferenceRegion. @@ -405,9 +363,9 @@ abstract class CoverageRDD * coverage of the first base pair in that bin. Java friendly variant. * * @param bpPerBin base pairs per bin, number of bases to combine to one bin. - * @return RDD of Coverage Records. + * @return Genomic dataset of Coverage Records. */ - def coverage(bpPerBin: java.lang.Integer): CoverageRDD = { + def coverage(bpPerBin: java.lang.Integer): CoverageDataset = { val bp: Int = bpPerBin coverage(bpPerBin = bp) } @@ -419,9 +377,9 @@ abstract class CoverageRDD * coverage of the first base pair in that bin. * * @param bpPerBin base pairs per bin, number of bases to combine to one bin. - * @return RDD of Coverage Records. + * @return Genomic dataset of Coverage Records. */ - def coverage(bpPerBin: Int = 1): CoverageRDD = { + def coverage(bpPerBin: Int = 1): CoverageDataset = { val flattened = flatten() @@ -442,9 +400,9 @@ abstract class CoverageRDD * the mean coverage over all base pairs in that bin. Java friendly variant. * * @param bpPerBin base pairs per bin, number of bases to combine to one bin. - * @return RDD of Coverage Records. + * @return Genomic dataset of Coverage Records. */ - def aggregatedCoverage(bpPerBin: java.lang.Integer): CoverageRDD = { + def aggregatedCoverage(bpPerBin: java.lang.Integer): CoverageDataset = { val bp: Int = bpPerBin aggregatedCoverage(bpPerBin = bp) } @@ -457,9 +415,9 @@ abstract class CoverageRDD * the mean coverage over all base pairs in that bin. * * @param bpPerBin base pairs per bin, number of bases to combine to one bin. - * @return RDD of Coverage Records. + * @return Genomic dataset of Coverage Records. */ - def aggregatedCoverage(bpPerBin: Int = 1): CoverageRDD = { + def aggregatedCoverage(bpPerBin: Int = 1): CoverageDataset = { val flattened = flatten() @@ -501,19 +459,19 @@ abstract class CoverageRDD /** * @param newRdd The RDD to replace the underlying RDD with. - * @return Returns a new CoverageRDD with the underlying RDD replaced. 
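To illustrate the binning operations defined here, a short sketch that collapses adjacent intervals with equal coverage and then summarizes coverage in 1 kbp bins; the input is assumed to exist already and the bin size is arbitrary.

    import org.bdgenomics.adam.rdd.feature.CoverageDataset

    def summarize(coverage: CoverageDataset): (CoverageDataset, CoverageDataset) = {
      val collapsed = coverage.collapse()
      // coverage() keeps the value at the first base pair of each bin;
      // aggregatedCoverage() averages over all base pairs in the bin.
      (collapsed.coverage(bpPerBin = 1000), collapsed.aggregatedCoverage(bpPerBin = 1000))
    }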
+ * @return Returns a new CoverageDataset with the underlying RDD replaced. */ protected def replaceRdd(newRdd: RDD[Coverage], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): CoverageRDD = { - RDDBoundCoverageRDD(newRdd, sequences, samples, newPartitionMap) + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): CoverageDataset = { + RDDBoundCoverageDataset(newRdd, sequences, samples, newPartitionMap) } /** - * Gets flattened RDD of coverage, with coverage mapped to a ReferenceRegion at each base pair. + * Gets flattened genomic dataset of coverage, with coverage mapped to a ReferenceRegion at each base pair. * - * @return CoverageRDD of flattened Coverage records. + * @return CoverageDataset of flattened Coverage records. */ - def flatten(): CoverageRDD = { + def flatten(): CoverageDataset = { transform(rdd => flatMapCoverage(rdd)) } @@ -521,7 +479,7 @@ abstract class CoverageRDD * Flat maps coverage into ReferenceRegion and counts for each base pair. * * @param rdd RDD of Coverage. - * @return RDD of flattened Coverage. + * @return Genomic dataset of flattened Coverage. */ private def flatMapCoverage(rdd: RDD[Coverage]): RDD[Coverage] = { rdd.flatMap(r => { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureDataset.scala similarity index 77% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureDataset.scala index 2d9d139af7..0ce3dd46b8 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureDataset.scala @@ -106,42 +106,51 @@ private trait FeatureOrdering[T <: Feature] extends Ordering[T] { } private object FeatureOrdering extends FeatureOrdering[Feature] {} -object FeatureRDD { +object FeatureDataset { /** - * A GenomicRDD that wraps a dataset of Feature data. + * A GenomicDataset that wraps a Dataset of Feature data with an empty sequence dictionary. * * @param ds A Dataset of genomic Features. - * @param sequences The reference genome these data are aligned to. + */ + def apply(ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, SequenceDictionary.empty, Seq.empty[Sample]) + } + + /** + * A GenomicDataset that wraps a Dataset of Feature data given a sequence dictionary. + * + * @param ds A Dataset of genomic Features. + * @param sd The reference genome these data are aligned to. */ def apply(ds: Dataset[FeatureProduct], sequences: SequenceDictionary, - samples: Iterable[Sample]): FeatureRDD = { - new DatasetBoundFeatureRDD(ds, sequences, samples.toSeq) + samples: Iterable[Sample]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, sequences, samples.toSeq) } /** - * Builds a FeatureRDD with an empty sequence dictionary. + * Builds a FeatureDataset that wraps an RDD of Feature data with an empty sequence dictionary. * * @param rdd The underlying Feature RDD to build from. - * @return Returns a new FeatureRDD. + * @return Returns a new FeatureDataset. */ - def apply(rdd: RDD[Feature]): FeatureRDD = { - FeatureRDD(rdd, SequenceDictionary.empty, Iterable.empty[Sample]) + def apply(rdd: RDD[Feature]): FeatureDataset = { + FeatureDataset(rdd, SequenceDictionary.empty, Iterable.empty[Sample]) } /** - * Builds a FeatureRDD given a sequence dictionary. 
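The metadata-free overload added above sits alongside the existing constructors. A sketch of both construction paths; the inputs are assumed to come from earlier loads or queries, and the import locations follow ADAM's package layout rather than anything shown in this hunk.

    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.Dataset
    import org.bdgenomics.adam.models.SequenceDictionary
    import org.bdgenomics.adam.rdd.feature.FeatureDataset
    import org.bdgenomics.adam.sql.{ Feature => FeatureProduct }
    import org.bdgenomics.formats.avro.{ Feature, Sample }

    // Dataset-backed, with an empty sequence dictionary (the overload added here).
    def fromDataset(ds: Dataset[FeatureProduct]): FeatureDataset =
      FeatureDataset(ds)

    // RDD-backed, with explicit metadata.
    def fromRdd(rdd: RDD[Feature],
                sd: SequenceDictionary,
                samples: Iterable[Sample]): FeatureDataset =
      FeatureDataset(rdd, sd, samples)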
+ * Builds a FeatureDataset that wraps an RDD of Feature data given a sequence dictionary. * * @param rdd The underlying Feature RDD to build from. - * @param sd The sequence dictionary for this FeatureRDD. - * @param samples The samples in this FeatureRDD. - * @return Returns a new FeatureRDD. + * @param sd The sequence dictionary for this FeatureDataset. + * @param samples The samples in this FeatureDataset. + * @return Returns a new FeatureDataset. */ def apply(rdd: RDD[Feature], sd: SequenceDictionary, - samples: Iterable[Sample]): FeatureRDD = { - new RDDBoundFeatureRDD(rdd, sd, samples.toSeq, None) + samples: Iterable[Sample]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, sd, samples.toSeq, None) } /** @@ -264,11 +273,11 @@ object FeatureRDD { } } -case class ParquetUnboundFeatureRDD private[rdd] ( +case class ParquetUnboundFeatureDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, sequences: SequenceDictionary, - @transient samples: Seq[Sample]) extends FeatureRDD { + @transient samples: Seq[Sample]) extends FeatureDataset { lazy val rdd: RDD[Feature] = { sc.loadParquet(parquetFilename) @@ -282,39 +291,27 @@ case class ParquetUnboundFeatureRDD private[rdd] ( sqlContext.read.parquet(parquetFilename).as[FeatureProduct] } - /** - * Replaces the sequence dictionary attached to a ParquetUnboundFeatureRDD. - * - * @param newSequences The new sequence dictionary to attach. - * @return Returns a new ParquetUnboundFeatureRDD with the sequences replaced. - */ - override def replaceSequences(newSequences: SequenceDictionary): FeatureRDD = { + override def replaceSequences(newSequences: SequenceDictionary): FeatureDataset = { copy(sequences = newSequences) } - /** - * Replaces the sample metadata attached to the ParquetUnboundFeatureRDD. - * - * @param newSamples The new sample metadata to attach. - * @return A ParquetUnboundFeatureRDD with new sample metadata. - */ - override def replaceSamples(newSamples: Iterable[Sample]): FeatureRDD = { + override def replaceSamples(newSamples: Iterable[Sample]): FeatureDataset = { copy(samples = newSamples.toSeq) } - def toCoverage(): CoverageRDD = { - ParquetUnboundCoverageRDD(sc, parquetFilename, sequences, samples) + def toCoverage(): CoverageDataset = { + ParquetUnboundCoverageDataset(sc, parquetFilename, sequences, samples) } } -case class DatasetBoundFeatureRDD private[rdd] ( +case class DatasetBoundFeatureDataset private[rdd] ( dataset: Dataset[FeatureProduct], sequences: SequenceDictionary, @transient samples: Seq[Sample], override val isPartitioned: Boolean = true, override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends FeatureRDD - with DatasetBoundGenomicDataset[Feature, FeatureProduct, FeatureRDD] { + override val optLookbackPartitions: Option[Int] = Some(1)) extends FeatureDataset + with DatasetBoundGenomicDataset[Feature, FeatureProduct, FeatureDataset] { lazy val rdd = dataset.rdd.map(_.toAvro) protected lazy val optPartitionMap = None @@ -334,93 +331,81 @@ case class DatasetBoundFeatureRDD private[rdd] ( } override def transformDataset( - tFn: Dataset[FeatureProduct] => Dataset[FeatureProduct]): FeatureRDD = { + tFn: Dataset[FeatureProduct] => Dataset[FeatureProduct]): FeatureDataset = { copy(dataset = tFn(dataset)) } - /** - * Replaces the sequence dictionary attached to a DatasetBoundFeatureRDD. - * - * @param newSequences The new sequence dictionary to attach. 
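For the Dataset-bound case the feature-to-coverage conversion shown here is just a column projection, so chaining it with a Spark SQL transformation stays cheap. A sketch, assuming a feature dataset whose score attribute holds per-region counts:

    import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }

    def featureCoverage(features: FeatureDataset): CoverageDataset = {
      features
        .transformDataset(ds => ds.filter(ds.col("score").isNotNull)) // drop unscored features
        .toCoverage()
    }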
- * @return Returns a new DatasetBoundFeatureRDD with the sequences replaced. - */ - override def replaceSequences(newSequences: SequenceDictionary): FeatureRDD = { + override def replaceSequences(newSequences: SequenceDictionary): FeatureDataset = { copy(sequences = newSequences) } - /** - * Replaces the sample metadata attached to the DatasetBoundFeatureRDD. - * - * @param newSamples The new sample metadata to attach. - * @return A DatasetBoundFeatureRDD with new sample metadata. - */ - override def replaceSamples(newSamples: Iterable[Sample]): FeatureRDD = { + override def replaceSamples(newSamples: Iterable[Sample]): FeatureDataset = { copy(samples = newSamples.toSeq) } - def toCoverage(): CoverageRDD = { + def toCoverage(): CoverageDataset = { import dataset.sqlContext.implicits._ - DatasetBoundCoverageRDD(dataset.toDF + DatasetBoundCoverageDataset(dataset.toDF .select("contigName", "start", "end", "score", "sampleId") .withColumnRenamed("score", "count") .withColumnRenamed("sampleId", "optSampleId") .as[Coverage], sequences, samples) } - override def filterToFeatureType(featureType: String): FeatureRDD = { + override def filterToFeatureType(featureType: String): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("featureType").eqNullSafe(featureType))) } - override def filterToFeatureTypes(featureTypes: Seq[String]): FeatureRDD = { + override def filterToFeatureTypes(featureTypes: Seq[String]): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("featureType") isin (featureTypes: _*))) } - override def filterToGene(geneId: String): FeatureRDD = { + override def filterToGene(geneId: String): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("geneId").eqNullSafe(geneId))) } - override def filterToGenes(geneIds: Seq[String]): FeatureRDD = { + override def filterToGenes(geneIds: Seq[String]): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("geneId") isin (geneIds: _*))) } - override def filterToTranscript(transcriptId: String): FeatureRDD = { + override def filterToTranscript(transcriptId: String): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("transcriptId").eqNullSafe(transcriptId))) } - override def filterToTranscripts(transcriptIds: Seq[String]): FeatureRDD = { + override def filterToTranscripts(transcriptIds: Seq[String]): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("transcriptId") isin (transcriptIds: _*))) } - override def filterToExon(exonId: String): FeatureRDD = { + override def filterToExon(exonId: String): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("exonId").eqNullSafe(exonId))) } - override def filterToExons(exonIds: Seq[String]): FeatureRDD = { + override def filterToExons(exonIds: Seq[String]): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("exonId") isin (exonIds: _*))) } - override def filterByScore(minimumScore: Double): FeatureRDD = { + override def filterByScore(minimumScore: Double): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("score").geq(minimumScore))) } - override def filterToParent(parentId: String): FeatureRDD = { + override def filterToParent(parentId: String): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("parentIds").contains(parentId))) } - override def filterToParents(parentIds: Seq[String]): FeatureRDD = { + override def filterToParents(parentIds: Seq[String]): FeatureDataset = { 
transformDataset(dataset => dataset.filter(dataset.col("parentIds") isin (parentIds: _*))) } - override def filterByAttribute(key: String, value: String): FeatureRDD = { + override def filterByAttribute(key: String, value: String): FeatureDataset = { transformDataset(dataset => dataset.filter(dataset.col("attributes").getItem(key).eqNullSafe(value))) } } -case class RDDBoundFeatureRDD private[rdd] ( +case class RDDBoundFeatureDataset private[rdd] ( rdd: RDD[Feature], sequences: SequenceDictionary, @transient samples: Seq[Sample], - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends FeatureRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends FeatureDataset { /** * A SQL Dataset of reads. @@ -431,34 +416,22 @@ case class RDDBoundFeatureRDD private[rdd] ( sqlContext.createDataset(rdd.map(FeatureProduct.fromAvro)) } - /** - * Replaces the sequence dictionary attached to a RDDBoundFeatureRDD. - * - * @param newSequences The new sequence dictionary to attach. - * @return Returns a new RDDBoundFeatureRDD with the sequences replaced. - */ - override def replaceSequences(newSequences: SequenceDictionary): FeatureRDD = { + override def replaceSequences(newSequences: SequenceDictionary): FeatureDataset = { copy(sequences = newSequences) } - /** - * Replaces the sample metadata attached to the RDDBoundFeatureRDD. - * - * @param newSamples The new sample metadata to attach. - * @return A RDDBoundFeatureRDD with new sample metadata. - */ - override def replaceSamples(newSamples: Iterable[Sample]): FeatureRDD = { + override def replaceSamples(newSamples: Iterable[Sample]): FeatureDataset = { copy(samples = newSamples.toSeq) } - def toCoverage(): CoverageRDD = { + def toCoverage(): CoverageDataset = { val coverageRdd = rdd.map(f => Coverage(f)) - RDDBoundCoverageRDD(coverageRdd, sequences, samples, optPartitionMap) + RDDBoundCoverageDataset(coverageRdd, sequences, samples, optPartitionMap) } } -sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProduct, FeatureRDD] - with MultisampleGenomicDataset[Feature, FeatureProduct, FeatureRDD] { +sealed abstract class FeatureDataset extends AvroGenomicDataset[Feature, FeatureProduct, FeatureDataset] + with MultisampleGenomicDataset[Feature, FeatureProduct, FeatureDataset] { protected val productFn = FeatureProduct.fromAvro(_) protected val unproductFn = (f: FeatureProduct) => f.toAvro @@ -471,9 +444,9 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } /** - * Saves metadata for a FeatureRDD, including partition map, sequences, and samples. + * Saves metadata for a FeatureDataset, including partition map, sequences, and samples. * - * @param pathName The path name to save meta data for this FeatureRDD. + * @param pathName The path name to save meta data for this FeatureDataset. 
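The push-down filter helpers above keep their names and semantics; only the return type changes. A sketch chaining a few of them; the attribute key and the score threshold are purely illustrative values.

    import org.bdgenomics.adam.rdd.feature.FeatureDataset

    def interestingFeatures(features: FeatureDataset): FeatureDataset = {
      features
        .filterToFeatureType("exon")                         // keep exon records only
        .filterByScore(100.0)                                // minimum score, inclusive
        .filterByAttribute("gene_biotype", "protein_coding") // illustrative key/value pair
    }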
*/ override protected def saveMetadata(pathName: String): Unit = { savePartitionMap(pathName) @@ -481,24 +454,24 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd saveSamples(pathName) } - def union(rdds: FeatureRDD*): FeatureRDD = { - val iterableRdds = rdds.toSeq - FeatureRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), - iterableRdds.map(_.samples).fold(samples)(_ ++ _)) + def union(datasets: FeatureDataset*): FeatureDataset = { + val iterableDatasets = datasets.toSeq + FeatureDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), + iterableDatasets.map(_.samples).fold(samples)(_ ++ _)) } /** - * Applies a function that transforms the underlying RDD into a new RDD using + * Applies a function that transforms the underlying Dataset into a new Dataset using * the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a Dataset. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset as a Dataset. + * @return A new FeatureDataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) is copied without modification. */ def transformDataset( - tFn: Dataset[FeatureProduct] => Dataset[FeatureProduct]): FeatureRDD = { - DatasetBoundFeatureRDD(tFn(dataset), sequences, samples) + tFn: Dataset[FeatureProduct] => Dataset[FeatureProduct]): FeatureDataset = { + DatasetBoundFeatureDataset(tFn(dataset), sequences, samples) } /** @@ -550,140 +523,140 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } /** - * Converts the FeatureRDD to a CoverageRDD. + * Converts the FeatureDataset to a CoverageDataset. * - * @return CoverageRDD containing RDD of Coverage. + * @return Genomic dataset containing Coverage records. */ - def toCoverage(): CoverageRDD + def toCoverage(): CoverageDataset /** - * Filter this FeatureRDD by feature type to those that match the specified feature type. + * Filter this FeatureDataset by feature type to those that match the specified feature type. * * @param featureType Feature type to filter by. - * @return FeatureRDD filtered by the specified feature type. + * @return FeatureDataset filtered by the specified feature type. */ - def filterToFeatureType(featureType: String): FeatureRDD = { + def filterToFeatureType(featureType: String): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getFeatureType).exists(_.equals(featureType)))) } /** - * Filter this FeatureRDD by feature type to those that match the specified feature types. + * Filter this FeatureDataset by feature type to those that match the specified feature types. * * @param featureType Sequence of feature types to filter by. - * @return FeatureRDD filtered by the specified feature types. + * @return FeatureDataset filtered by the specified feature types. */ - def filterToFeatureTypes(featureTypes: Seq[String]): FeatureRDD = { + def filterToFeatureTypes(featureTypes: Seq[String]): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getFeatureType).exists(featureTypes.contains(_)))) } /** - * Filter this FeatureRDD by gene to those that match the specified gene. + * Filter this FeatureDataset by gene to those that match the specified gene. * * @param geneId Gene to filter by. - * @return FeatureRDD filtered by the specified gene. 
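As with the other genomic datasets, union on features folds the attached metadata as well as the records. A small sketch; the inputs are assumed to have been loaded elsewhere.

    import org.bdgenomics.adam.rdd.feature.FeatureDataset

    // The result carries the merged sequence dictionaries and the concatenated
    // sample lists of all inputs, per the fold in the definition above.
    def mergeAnnotations(a: FeatureDataset, b: FeatureDataset, c: FeatureDataset): FeatureDataset =
      a.union(b, c)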
+ * @return FeatureDataset filtered by the specified gene. */ - def filterToGene(geneId: String): FeatureRDD = { + def filterToGene(geneId: String): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getGeneId).exists(_.equals(geneId)))) } /** - * Filter this FeatureRDD by gene to those that match the specified genes. + * Filter this FeatureDataset by gene to those that match the specified genes. * * @param geneIds Sequence of genes to filter by. - * @return FeatureRDD filtered by the specified genes. + * @return FeatureDataset filtered by the specified genes. */ - def filterToGenes(geneIds: Seq[String]): FeatureRDD = { + def filterToGenes(geneIds: Seq[String]): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getGeneId).exists(geneIds.contains(_)))) } /** - * Filter this FeatureRDD by transcript to those that match the specified transcript. + * Filter this FeatureDataset by transcript to those that match the specified transcript. * * @param transcriptId Transcript to filter by. - * @return FeatureRDD filtered by the specified transcript. + * @return FeatureDataset filtered by the specified transcript. */ - def filterToTranscript(transcriptId: String): FeatureRDD = { + def filterToTranscript(transcriptId: String): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getTranscriptId).exists(_.equals(transcriptId)))) } /** - * Filter this FeatureRDD by transcript to those that match the specified transcripts. + * Filter this FeatureDataset by transcript to those that match the specified transcripts. * * @param transcriptIds Sequence of transcripts to filter by. - * @return FeatureRDD filtered by the specified transcripts. + * @return FeatureDataset filtered by the specified transcripts. */ - def filterToTranscripts(transcriptIds: Seq[String]): FeatureRDD = { + def filterToTranscripts(transcriptIds: Seq[String]): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getTranscriptId).exists(transcriptIds.contains(_)))) } /** - * Filter this FeatureRDD by exon to those that match the specified exon. + * Filter this FeatureDataset by exon to those that match the specified exon. * * @param exonId Exon to filter by. - * @return FeatureRDD filtered by the specified exon. + * @return FeatureDataset filtered by the specified exon. */ - def filterToExon(exonId: String): FeatureRDD = { + def filterToExon(exonId: String): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getExonId).exists(_.equals(exonId)))) } /** - * Filter this FeatureRDD by exon to those that match the specified exons. + * Filter this FeatureDataset by exon to those that match the specified exons. * * @param exonIds Sequence of exons to filter by. - * @return FeatureRDD filtered by the specified exons. + * @return FeatureDataset filtered by the specified exons. */ - def filterToExons(exonIds: Seq[String]): FeatureRDD = { + def filterToExons(exonIds: Seq[String]): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getExonId).exists(exonIds.contains(_)))) } /** - * Filter this FeatureRDD by score. + * Filter this FeatureDataset by score. * * @param minimumScore Minimum score to filter by, inclusive. - * @return FeatureRDD filtered by the specified minimum score. + * @return FeatureDataset filtered by the specified minimum score. 
*/ - def filterByScore(minimumScore: Double): FeatureRDD = { + def filterByScore(minimumScore: Double): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getScore).exists(_ >= minimumScore))) } /** - * Filter this FeatureRDD by parent to those that match the specified parent. + * Filter this FeatureDataset by parent to those that match the specified parent. * * @param parentId Parent to filter by. - * @return FeatureRDD filtered by the specified parent. + * @return FeatureDataset filtered by the specified parent. */ - def filterToParent(parentId: String): FeatureRDD = { + def filterToParent(parentId: String): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getParentIds).exists(_.contains(parentId)))) } /** - * Filter this FeatureRDD by parent to those that match the specified parents. + * Filter this FeatureDataset by parent to those that match the specified parents. * * @param parentIds Sequence of parents to filter by. - * @return FeatureRDD filtered by the specified parents. + * @return FeatureDataset filtered by the specified parents. */ - def filterToParents(parentIds: Seq[String]): FeatureRDD = { + def filterToParents(parentIds: Seq[String]): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getParentIds).exists(!Collections.disjoint(_, parentIds)))) } /** - * Filter this FeatureRDD by attribute to those that match the specified attribute key and value. + * Filter this FeatureDataset by attribute to those that match the specified attribute key and value. * * @param key Attribute key to filter by. * @param value Attribute value to filter by. - * @return FeatureRDD filtered by the specified attribute. + * @return FeatureDataset filtered by the specified attribute. */ - def filterByAttribute(key: String, value: String): FeatureRDD = { + def filterByAttribute(key: String, value: String): FeatureDataset = { transform(rdd => rdd.filter(f => Option(f.getAttributes.get(key)).exists(_.equals(value)))) } /** * @param newRdd The RDD to replace the underlying RDD with. - * @return Returns a new FeatureRDD with the underlying RDD replaced. + * @return Returns a new FeatureDataset with the underlying RDD replaced. */ protected def replaceRdd(newRdd: RDD[Feature], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): FeatureRDD = { - new RDDBoundFeatureRDD(newRdd, sequences, samples, newPartitionMap) + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): FeatureDataset = { + new RDDBoundFeatureDataset(newRdd, sequences, samples, newPartitionMap) } /** @@ -696,7 +669,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } /** - * Save this FeatureRDD in GTF format. + * Save this FeatureDataset in GTF format. * * @param fileName The path to save GTF formatted text file(s) to. * @param asSingleFile By default (false), writes file to disk as shards with @@ -708,14 +681,14 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd def saveAsGtf(fileName: String, asSingleFile: Boolean = false, disableFastConcat: Boolean = false) = { - writeTextRdd(rdd.map(FeatureRDD.toGtf), + writeTextRdd(rdd.map(FeatureDataset.toGtf), fileName, asSingleFile, disableFastConcat) } /** - * Save this FeatureRDD in GFF3 format. + * Save this FeatureDataset in GFF3 format. * * @param fileName The path to save GFF3 formatted text file(s) to. 
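The text writers renamed here keep their signatures; only the companion object they call into changes. A sketch writing single-file GTF and BED output; the paths are placeholders.

    import org.bdgenomics.adam.rdd.feature.FeatureDataset

    def export(features: FeatureDataset): Unit = {
      // asSingleFile = true merges the output shards into one file on save.
      features.saveAsGtf("features.gtf", asSingleFile = true)
      features.saveAsBed("features.bed", asSingleFile = true)
    }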
* @param asSingleFile By default (false), writes file to disk as shards with @@ -734,7 +707,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } else { None } - writeTextRdd(rdd.map(FeatureRDD.toGff3), + writeTextRdd(rdd.map(FeatureDataset.toGff3), fileName, asSingleFile, disableFastConcat, @@ -742,7 +715,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } /** - * Save this FeatureRDD in UCSC BED format, where score is formatted as + * Save this FeatureDataset in UCSC BED format, where score is formatted as * integer values between 0 and 1000, with missing value as specified. * * @param fileName The path to save BED formatted text file(s) to. @@ -762,14 +735,14 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd maximumScore: Double, missingValue: Int = 0) = { - writeTextRdd(rdd.map(FeatureRDD.toBed(_, Some(minimumScore), Some(maximumScore), Some(missingValue))), + writeTextRdd(rdd.map(FeatureDataset.toBed(_, Some(minimumScore), Some(maximumScore), Some(missingValue))), fileName, asSingleFile, disableFastConcat) } /** - * Save this FeatureRDD in bedtools2 BED format, where score is formatted + * Save this FeatureDataset in bedtools2 BED format, where score is formatted * as double floating point values with missing values. * * @param fileName The path to save BED formatted text file(s) to. @@ -782,15 +755,14 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd def saveAsBed(fileName: String, asSingleFile: Boolean = false, disableFastConcat: Boolean = false) = { - - writeTextRdd(rdd.map(FeatureRDD.toBed), + writeTextRdd(rdd.map(FeatureDataset.toBed), fileName, asSingleFile, disableFastConcat) } /** - * Save this FeatureRDD in interval list format. + * Save this FeatureDataset in interval list format. * * @param fileName The path to save interval list formatted text file(s) to. * @param asSingleFile By default (false), writes file to disk as shards with @@ -802,7 +774,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd def saveAsIntervalList(fileName: String, asSingleFile: Boolean = false, disableFastConcat: Boolean = false) = { - val intervalEntities = rdd.map(FeatureRDD.toInterval) + val intervalEntities = rdd.map(FeatureDataset.toInterval) if (asSingleFile) { @@ -832,7 +804,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd } /** - * Save this FeatureRDD in NarrowPeak format. + * Save this FeatureDataset in NarrowPeak format. * * @param fileName The path to save NarrowPeak formatted text file(s) to. * @param asSingleFile By default (false), writes file to disk as shards with @@ -844,7 +816,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd def saveAsNarrowPeak(fileName: String, asSingleFile: Boolean = false, disableFastConcat: Boolean = false) { - writeTextRdd(rdd.map(FeatureRDD.toNarrowPeak), + writeTextRdd(rdd.map(FeatureDataset.toNarrowPeak), fileName, asSingleFile, disableFastConcat) @@ -857,7 +829,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd * @param numPartitions The number of partitions to have after sorting. * Defaults to the partition count of the underlying RDD. 
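Sorting before export is a common pairing with these writers. A sketch that sorts by reference position and then writes interval list and NarrowPeak files; the paths are placeholders.

    import org.bdgenomics.adam.rdd.feature.FeatureDataset

    def sortAndExport(features: FeatureDataset): Unit = {
      val sorted = features.sortByReference()
      sorted.saveAsIntervalList("features.interval_list", asSingleFile = true)
      sorted.saveAsNarrowPeak("features.narrowPeak", asSingleFile = true)
    }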
*/ - def sortByReference(ascending: Boolean = true, numPartitions: Int = rdd.partitions.length): FeatureRDD = { + def sortByReference(ascending: Boolean = true, numPartitions: Int = rdd.partitions.length): FeatureDataset = { implicit def ord = FeatureOrdering replaceRdd(rdd.sortBy(f => f, ascending, numPartitions)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GFF3InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GFF3InFormatter.scala index b2fa6eaa05..734d66f106 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GFF3InFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GFF3InFormatter.scala @@ -30,19 +30,19 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that builds a GFF3InFormatter to write features in GFF3 format to a pipe. */ -object GFF3InFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureRDD, GFF3InFormatter] { +object GFF3InFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureDataset, GFF3InFormatter] { /** - * Apply method for building the GFF3InFormatter from a FeatureRDD. + * Apply method for building the GFF3InFormatter from a FeatureDataset. * - * @param fRdd FeatureRDD to build from. + * @param fRdd FeatureDataset to build from. */ - def apply(fRdd: FeatureRDD): GFF3InFormatter = { + def apply(fRdd: FeatureDataset): GFF3InFormatter = { GFF3InFormatter() } } -case class GFF3InFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureRDD, GFF3InFormatter] { +case class GFF3InFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureDataset, GFF3InFormatter] { protected val companion = GFF3InFormatter /** @@ -56,7 +56,7 @@ case class GFF3InFormatter private () extends InFormatter[Feature, FeatureProduc // write the features iter.foreach(f => { - writer.write(FeatureRDD.toGff3(f)) + writer.write(FeatureDataset.toGff3(f)) writer.newLine() }) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GTFInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GTFInFormatter.scala index a0c5c8e40c..c3ae0e1c53 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GTFInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/GTFInFormatter.scala @@ -30,19 +30,19 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that builds a GTFInFormatter to write features in GTF format to a pipe. */ -object GTFInFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureRDD, GTFInFormatter] { +object GTFInFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureDataset, GTFInFormatter] { /** - * Apply method for building the GTFInFormatter from a FeatureRDD. + * Apply method for building the GTFInFormatter from a FeatureDataset. * - * @param fRdd FeatureRDD to build from. + * @param fRdd FeatureDataset to build from. 
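These InFormatter companions are typically resolved implicitly by the pipe API, but they can also be built directly from the renamed dataset type. A sketch; note that, as the apply methods show, the formatters take no state from the dataset they are built from.

    import org.bdgenomics.adam.rdd.feature.{ FeatureDataset, GFF3InFormatter, GTFInFormatter }

    def formatters(features: FeatureDataset): (GFF3InFormatter, GTFInFormatter) =
      (GFF3InFormatter(features), GTFInFormatter(features))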
*/ - def apply(fRdd: FeatureRDD): GTFInFormatter = { + def apply(fRdd: FeatureDataset): GTFInFormatter = { GTFInFormatter() } } -case class GTFInFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureRDD, GTFInFormatter] { +case class GTFInFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureDataset, GTFInFormatter] { protected val companion = GTFInFormatter /** @@ -56,7 +56,7 @@ case class GTFInFormatter private () extends InFormatter[Feature, FeatureProduct // write the features iter.foreach(f => { - writer.write(FeatureRDD.toGtf(f)) + writer.write(FeatureDataset.toGtf(f)) writer.newLine() }) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/NarrowPeakInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/NarrowPeakInFormatter.scala index de1bb19499..5a88c2cffd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/NarrowPeakInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/NarrowPeakInFormatter.scala @@ -30,19 +30,19 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that builds a NarrowPeakInFormatter to write features in NarrowPeak format to a pipe. */ -object NarrowPeakInFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureRDD, NarrowPeakInFormatter] { +object NarrowPeakInFormatter extends InFormatterCompanion[Feature, FeatureProduct, FeatureDataset, NarrowPeakInFormatter] { /** - * Apply method for building the NarrowPeakInFormatter from a FeatureRDD. + * Apply method for building the NarrowPeakInFormatter from a FeatureDataset. * - * @param fRdd FeatureRDD to build from. + * @param fRdd FeatureDataset to build from. */ - def apply(fRdd: FeatureRDD): NarrowPeakInFormatter = { + def apply(fRdd: FeatureDataset): NarrowPeakInFormatter = { NarrowPeakInFormatter() } } -case class NarrowPeakInFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureRDD, NarrowPeakInFormatter] { +case class NarrowPeakInFormatter private () extends InFormatter[Feature, FeatureProduct, FeatureDataset, NarrowPeakInFormatter] { protected val companion = NarrowPeakInFormatter /** @@ -56,7 +56,7 @@ case class NarrowPeakInFormatter private () extends InFormatter[Feature, Feature // write the features iter.foreach(f => { - writer.write(FeatureRDD.toNarrowPeak(f)) + writer.write(FeatureDataset.toNarrowPeak(f)) writer.newLine() }) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentDataset.scala similarity index 74% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentDataset.scala index d2bcf0cc49..1e70ab8498 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/FragmentDataset.scala @@ -36,7 +36,7 @@ import org.bdgenomics.adam.rdd.{ JavaSaveArgs } import org.bdgenomics.adam.rdd.read.{ - AlignmentRecordRDD, + AlignmentRecordDataset, BinQualities, MarkDuplicates, QualityScoreBin @@ -79,52 +79,52 @@ private[adam] class FragmentArraySerializer extends IntervalArraySerializer[Refe } /** - * Helper singleton object for building FragmentRDDs. + * Helper singleton object for building FragmentDatasets. 
*/ -object FragmentRDD { +object FragmentDataset { /** * Hadoop configuration path to check for a boolean value indicating whether * the current or original read qualities should be written. True indicates * to write the original qualities. The default is false. */ - val WRITE_ORIGINAL_QUALITIES = "org.bdgenomics.adam.rdd.fragment.FragmentRDD.writeOriginalQualities" + val WRITE_ORIGINAL_QUALITIES = "org.bdgenomics.adam.rdd.fragment.FragmentDataset.writeOriginalQualities" /** * Hadoop configuration path to check for a boolean value indicating whether * to write the "/1" "/2" suffixes to the read name that indicate whether a * read is first or second in a pair. Default is false (no suffixes). */ - val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.fragment.FragmentRDD.writeSuffixes" + val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.fragment.FragmentDataset.writeSuffixes" /** - * Creates a FragmentRDD where no record groups or sequence info are attached. + * Creates a FragmentDataset where no record groups or sequence info are attached. * * @param rdd RDD of fragments. - * @return Returns a FragmentRDD with an empty record group dictionary and sequence dictionary. + * @return Returns a FragmentDataset with an empty record group dictionary and sequence dictionary. */ - private[adam] def fromRdd(rdd: RDD[Fragment]): FragmentRDD = { - FragmentRDD(rdd, + private[adam] def fromRdd(rdd: RDD[Fragment]): FragmentDataset = { + FragmentDataset(rdd, SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty) } /** - * Builds a FragmentRDD without a partition map. + * Builds a FragmentDataset without a partition map. * - * @param rdd The underlying Franment RDD. - * @param sequences The sequence dictionary for the RDD. - * @param recordGroupDictionary The record group dictionary for the RDD. + * @param rdd The underlying Fragment RDD. + * @param sequences The sequence dictionary for the genomic dataset. + * @param recordGroupDictionary The record group dictionary for the genomic dataset. * @param processingSteps The processing steps that have been applied to this data. - * @return A new FragmentRDD. + * @return A new FragmentDataset. */ def apply(rdd: RDD[Fragment], sequences: SequenceDictionary, recordGroupDictionary: RecordGroupDictionary, - processingSteps: Seq[ProcessingStep]): FragmentRDD = { + processingSteps: Seq[ProcessingStep]): FragmentDataset = { - new RDDBoundFragmentRDD(rdd, + new RDDBoundFragmentDataset(rdd, sequences, recordGroupDictionary, processingSteps, @@ -132,28 +132,28 @@ object FragmentRDD { } /** - * A genomic RDD that supports Datasets of Fragments. + * A genomic dataset that supports Datasets of Fragments. * * @param ds The underlying Dataset of Fragment data. * @param sequences The genomic sequences this data was aligned to, if any. * @param recordGroups The record groups these Fragments came from. * @param processingSteps The processing steps that have been applied to this data. - * @return A new FragmentRDD. + * @return A new FragmentDataset. 
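Because the configuration keys above embed the class name, call sites that reference the constants rather than the literal strings pick up the rename automatically. A sketch of setting both flags before writing fragments back out, assuming a SparkContext named sc is available:

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.fragment.FragmentDataset

    def configureFragmentOutput(sc: SparkContext): Unit = {
      // Emit original qualities, and skip the "/1" and "/2" read-name suffixes.
      sc.hadoopConfiguration.setBoolean(FragmentDataset.WRITE_ORIGINAL_QUALITIES, true)
      sc.hadoopConfiguration.setBoolean(FragmentDataset.WRITE_SUFFIXES, false)
    }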
*/ def apply(ds: Dataset[FragmentProduct], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, - processingSteps: Seq[ProcessingStep]): FragmentRDD = { - DatasetBoundFragmentRDD(ds, sequences, recordGroups, processingSteps) + processingSteps: Seq[ProcessingStep]): FragmentDataset = { + DatasetBoundFragmentDataset(ds, sequences, recordGroups, processingSteps) } } -case class ParquetUnboundFragmentRDD private[rdd] ( +case class ParquetUnboundFragmentDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, - @transient val processingSteps: Seq[ProcessingStep]) extends FragmentRDD { + @transient val processingSteps: Seq[ProcessingStep]) extends FragmentDataset { lazy val rdd: RDD[Fragment] = { sc.loadParquet(parquetFilename) @@ -168,30 +168,30 @@ case class ParquetUnboundFragmentRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): FragmentRDD = { + newSequences: SequenceDictionary): FragmentDataset = { copy(sequences = newSequences) } def replaceRecordGroups( - newRecordGroups: RecordGroupDictionary): FragmentRDD = { + newRecordGroups: RecordGroupDictionary): FragmentDataset = { copy(recordGroups = newRecordGroups) } def replaceProcessingSteps( - newProcessingSteps: Seq[ProcessingStep]): FragmentRDD = { + newProcessingSteps: Seq[ProcessingStep]): FragmentDataset = { copy(processingSteps = newProcessingSteps) } } -case class DatasetBoundFragmentRDD private[rdd] ( +case class DatasetBoundFragmentDataset private[rdd] ( dataset: Dataset[FragmentProduct], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, @transient val processingSteps: Seq[ProcessingStep], override val isPartitioned: Boolean = false, override val optPartitionBinSize: Option[Int] = None, - override val optLookbackPartitions: Option[Int] = None) extends FragmentRDD - with DatasetBoundGenomicDataset[Fragment, FragmentProduct, FragmentRDD] { + override val optLookbackPartitions: Option[Int] = None) extends FragmentDataset + with DatasetBoundGenomicDataset[Fragment, FragmentProduct, FragmentDataset] { lazy val rdd = dataset.rdd.map(_.toAvro) @@ -212,35 +212,35 @@ case class DatasetBoundFragmentRDD private[rdd] ( } override def transformDataset( - tFn: Dataset[FragmentProduct] => Dataset[FragmentProduct]): FragmentRDD = { + tFn: Dataset[FragmentProduct] => Dataset[FragmentProduct]): FragmentDataset = { copy(dataset = tFn(dataset)) } def replaceSequences( - newSequences: SequenceDictionary): FragmentRDD = { + newSequences: SequenceDictionary): FragmentDataset = { copy(sequences = newSequences) } def replaceRecordGroups( - newRecordGroups: RecordGroupDictionary): FragmentRDD = { + newRecordGroups: RecordGroupDictionary): FragmentDataset = { copy(recordGroups = newRecordGroups) } def replaceProcessingSteps( - newProcessingSteps: Seq[ProcessingStep]): FragmentRDD = { + newProcessingSteps: Seq[ProcessingStep]): FragmentDataset = { copy(processingSteps = newProcessingSteps) } } -case class RDDBoundFragmentRDD private[rdd] ( +case class RDDBoundFragmentDataset private[rdd] ( rdd: RDD[Fragment], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, @transient val processingSteps: Seq[ProcessingStep], - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends FragmentRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends FragmentDataset { /** - * A SQL Dataset of reads. 
+ * A SQL Dataset of fragments. */ lazy val dataset: Dataset[FragmentProduct] = { val sqlContext = SQLContext.getOrCreate(rdd.context) @@ -249,22 +249,22 @@ case class RDDBoundFragmentRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): FragmentRDD = { + newSequences: SequenceDictionary): FragmentDataset = { copy(sequences = newSequences) } def replaceRecordGroups( - newRecordGroups: RecordGroupDictionary): FragmentRDD = { + newRecordGroups: RecordGroupDictionary): FragmentDataset = { copy(recordGroups = newRecordGroups) } def replaceProcessingSteps( - newProcessingSteps: Seq[ProcessingStep]): FragmentRDD = { + newProcessingSteps: Seq[ProcessingStep]): FragmentDataset = { copy(processingSteps = newProcessingSteps) } } -sealed abstract class FragmentRDD extends AvroRecordGroupGenomicDataset[Fragment, FragmentProduct, FragmentRDD] { +sealed abstract class FragmentDataset extends AvroRecordGroupGenomicDataset[Fragment, FragmentProduct, FragmentDataset] { protected val productFn = FragmentProduct.fromAvro(_) protected val unproductFn = (f: FragmentProduct) => f.toAvro @@ -280,37 +280,37 @@ sealed abstract class FragmentRDD extends AvroRecordGroupGenomicDataset[Fragment * Replaces the underlying RDD with a new RDD. * * @param newRdd The RDD to replace our underlying RDD with. - * @return Returns a new FragmentRDD where the underlying RDD has been + * @return Returns a new FragmentDataset where the underlying RDD has been * swapped out. */ protected def replaceRdd(newRdd: RDD[Fragment], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): FragmentRDD = { - RDDBoundFragmentRDD(newRdd, + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): FragmentDataset = { + RDDBoundFragmentDataset(newRdd, sequences, recordGroups, processingSteps, newPartitionMap) } - def union(rdds: FragmentRDD*): FragmentRDD = { - val iterableRdds = rdds.toSeq - FragmentRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), - iterableRdds.map(_.recordGroups).fold(recordGroups)(_ ++ _), - iterableRdds.map(_.processingSteps).fold(processingSteps)(_ ++ _)) + def union(datasets: FragmentDataset*): FragmentDataset = { + val iterableDatasets = datasets.toSeq + FragmentDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), + iterableDatasets.map(_.recordGroups).fold(recordGroups)(_ ++ _), + iterableDatasets.map(_.processingSteps).fold(processingSteps)(_ ++ _)) } /** - * Applies a function that transforms the underlying RDD into a new RDD using + * Applies a function that transforms the underlying Dataset into a new Dataset using * the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a Dataset. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset as a Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) is copied without modification. 
*/ def transformDataset( - tFn: Dataset[FragmentProduct] => Dataset[FragmentProduct]): FragmentRDD = { - DatasetBoundFragmentRDD(tFn(dataset), + tFn: Dataset[FragmentProduct] => Dataset[FragmentProduct]): FragmentDataset = { + DatasetBoundFragmentDataset(tFn(dataset), sequences, recordGroups, processingSteps) @@ -319,16 +319,16 @@ sealed abstract class FragmentRDD extends AvroRecordGroupGenomicDataset[Fragment /** * Essentially, splits up the reads in a Fragment. * - * @return Returns this RDD converted back to reads. + * @return Returns this genomic dataset converted back to reads. */ - def toReads(): AlignmentRecordRDD = { + def toReads(): AlignmentRecordDataset = { val converter = new AlignmentRecordConverter // convert the fragments to reads val newRdd = rdd.flatMap(converter.convertFragment) // are we aligned? - AlignmentRecordRDD(newRdd, + AlignmentRecordDataset(newRdd, sequences, recordGroups, processingSteps) @@ -337,10 +337,10 @@ sealed abstract class FragmentRDD extends AvroRecordGroupGenomicDataset[Fragment /** * Marks reads as possible fragment duplicates. * - * @return A new RDD where reads have the duplicate read flag set. Duplicate + * @return A new genomic dataset where reads have the duplicate read flag set. Duplicate * reads are NOT filtered out. */ - def markDuplicates(): FragmentRDD = MarkDuplicatesInDriver.time { + def markDuplicates(): FragmentDataset = MarkDuplicatesInDriver.time { replaceRdd(MarkDuplicates(this)) } @@ -363,8 +363,8 @@ sealed abstract class FragmentRDD extends AvroRecordGroupGenomicDataset[Fragment * @param bins The bins to use. * @return Fragments whose quality scores are binned. */ - def binQualityScores(bins: Seq[QualityScoreBin]): FragmentRDD = { - AlignmentRecordRDD.validateBins(bins) + def binQualityScores(bins: Seq[QualityScoreBin]): FragmentDataset = { + AlignmentRecordDataset.validateBins(bins) BinQualities(this, bins) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala index a573c2ab5e..c32a4cb600 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala @@ -29,26 +29,26 @@ import org.bdgenomics.utils.misc.Logging * InFormatter companion that creates an InFormatter that writes interleaved * FASTQ. */ -object InterleavedFASTQInFormatter extends InFormatterCompanion[Fragment, FragmentProduct, FragmentRDD, InterleavedFASTQInFormatter] { +object InterleavedFASTQInFormatter extends InFormatterCompanion[Fragment, FragmentProduct, FragmentDataset, InterleavedFASTQInFormatter] { /** * Builds an InterleavedFASTQInFormatter to write Interleaved FASTQ. * - * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration. + * @param gDataset GenomicDataset of Fragments. Used to get HadoopConfiguration. * @return Returns a new Interleaved FASTQ InFormatter. 
*/ - def apply(gRdd: FragmentRDD): InterleavedFASTQInFormatter = { - new InterleavedFASTQInFormatter(gRdd.rdd.context.hadoopConfiguration) + def apply(gDataset: FragmentDataset): InterleavedFASTQInFormatter = { + new InterleavedFASTQInFormatter(gDataset.rdd.context.hadoopConfiguration) } } class InterleavedFASTQInFormatter private ( - conf: Configuration) extends InFormatter[Fragment, FragmentProduct, FragmentRDD, InterleavedFASTQInFormatter] with Logging { + conf: Configuration) extends InFormatter[Fragment, FragmentProduct, FragmentDataset, InterleavedFASTQInFormatter] with Logging { protected val companion = InterleavedFASTQInFormatter private val converter = new AlignmentRecordConverter - private val writeSuffixes = conf.getBoolean(FragmentRDD.WRITE_SUFFIXES, false) - private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false) + private val writeSuffixes = conf.getBoolean(FragmentDataset.WRITE_SUFFIXES, false) + private val writeOriginalQualities = conf.getBoolean(FragmentDataset.WRITE_ORIGINAL_QUALITIES, false) /** * Writes alignment records to an output stream in interleaved FASTQ format. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala index 78f7a2d9e1..5a6374d3a4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala @@ -28,26 +28,26 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that creates an InFormatter that writes Bowtie tab5 format. */ -object Tab5InFormatter extends InFormatterCompanion[Fragment, FragmentProduct, FragmentRDD, Tab5InFormatter] { +object Tab5InFormatter extends InFormatterCompanion[Fragment, FragmentProduct, FragmentDataset, Tab5InFormatter] { /** * Builds an Tab5InFormatter to write Bowtie tab5 format. * - * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration. + * @param gDataset GenomicDataset of Fragments. Used to get HadoopConfiguration. * @return Returns a new Tab6InFormatter. */ - def apply(gRdd: FragmentRDD): Tab5InFormatter = { - new Tab5InFormatter(gRdd.rdd.context.hadoopConfiguration) + def apply(gDataset: FragmentDataset): Tab5InFormatter = { + new Tab5InFormatter(gDataset.rdd.context.hadoopConfiguration) } } class Tab5InFormatter private ( - conf: Configuration) extends InFormatter[Fragment, FragmentProduct, FragmentRDD, Tab5InFormatter] with Logging { + conf: Configuration) extends InFormatter[Fragment, FragmentProduct, FragmentDataset, Tab5InFormatter] with Logging { protected val companion = Tab5InFormatter private val newLine = "\n".getBytes private val converter = new AlignmentRecordConverter - private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false) + private val writeOriginalQualities = conf.getBoolean(FragmentDataset.WRITE_ORIGINAL_QUALITIES, false) /** * Writes alignment records to an output stream in Bowtie tab5 format. 
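The fragment in-formatters above take their behaviour from two Hadoop configuration keys that this patch moves under FragmentDataset. A minimal sketch of toggling them before piping fragments out; the helper name configureFastqFormatting and the SparkContext value sc are illustrative, not part of the patch:

import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.fragment.FragmentDataset

def configureFastqFormatting(sc: SparkContext): Unit = {
  // write the original (OQ) qualities rather than the current quality scores
  sc.hadoopConfiguration.setBoolean(FragmentDataset.WRITE_ORIGINAL_QUALITIES, true)
  // append "/1" and "/2" suffixes to read names marking first/second of pair
  sc.hadoopConfiguration.setBoolean(FragmentDataset.WRITE_SUFFIXES, true)
}

Both keys default to false, so leaving them unset keeps the existing behaviour.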
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab6InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab6InFormatter.scala index fc2f3a6979..2d88e2b05d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab6InFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab6InFormatter.scala @@ -28,27 +28,27 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that creates an InFormatter that writes Bowtie tab6 format. */ -object Tab6InFormatter extends InFormatterCompanion[Fragment, FragmentProduct, FragmentRDD, Tab6InFormatter] { +object Tab6InFormatter extends InFormatterCompanion[Fragment, FragmentProduct, FragmentDataset, Tab6InFormatter] { /** * Builds an Tab6InFormatter to write Bowtie tab6 format. * - * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration. + * @param gDataset GenomicDataset of Fragments. Used to get HadoopConfiguration. * @return Returns a new Tab6InFormatter. */ - def apply(gRdd: FragmentRDD): Tab6InFormatter = { - new Tab6InFormatter(gRdd.rdd.context.hadoopConfiguration) + def apply(gDataset: FragmentDataset): Tab6InFormatter = { + new Tab6InFormatter(gDataset.rdd.context.hadoopConfiguration) } } class Tab6InFormatter private ( - conf: Configuration) extends InFormatter[Fragment, FragmentProduct, FragmentRDD, Tab6InFormatter] with Logging { + conf: Configuration) extends InFormatter[Fragment, FragmentProduct, FragmentDataset, Tab6InFormatter] with Logging { protected val companion = Tab6InFormatter private val newLine = "\n".getBytes private val converter = new AlignmentRecordConverter - private val writeSuffixes = conf.getBoolean(FragmentRDD.WRITE_SUFFIXES, false) - private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false) + private val writeSuffixes = conf.getBoolean(FragmentDataset.WRITE_SUFFIXES, false) + private val writeOriginalQualities = conf.getBoolean(FragmentDataset.WRITE_ORIGINAL_QUALITIES, false) /** * Writes alignment records to an output stream in Bowtie tab6 format. 
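Each renamed in-formatter companion follows the same construction pattern: apply takes the genomic dataset and lifts the Hadoop configuration off its underlying RDD's SparkContext. A sketch assuming fragments is an existing FragmentDataset; the helper name tab6FormatterFor is illustrative:

import org.bdgenomics.adam.rdd.fragment.{ FragmentDataset, Tab6InFormatter }

def tab6FormatterFor(fragments: FragmentDataset): Tab6InFormatter = {
  // the companion reads FragmentDataset.WRITE_SUFFIXES and
  // FragmentDataset.WRITE_ORIGINAL_QUALITIES from
  // fragments.rdd.context.hadoopConfiguration when building the formatter
  Tab6InFormatter(fragments)
}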
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDataset.scala similarity index 85% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDataset.scala index 0b77299365..29453ec0cf 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDataset.scala @@ -44,14 +44,14 @@ import org.bdgenomics.adam.models._ import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd._ import org.bdgenomics.adam.rdd.feature.{ - CoverageRDD, - DatasetBoundCoverageRDD, - RDDBoundCoverageRDD + CoverageDataset, + DatasetBoundCoverageDataset, + RDDBoundCoverageDataset } import org.bdgenomics.adam.rdd.read.realignment.RealignIndels import org.bdgenomics.adam.rdd.read.recalibration.BaseQualityRecalibration -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct } import org.bdgenomics.adam.serialization.AvroSerializer import org.bdgenomics.adam.util.{ FileMerger, ReferenceFile } @@ -92,21 +92,21 @@ private[adam] class AlignmentRecordArraySerializer extends IntervalArraySerializ } } -object AlignmentRecordRDD extends Serializable { +object AlignmentRecordDataset extends Serializable { /** * Hadoop configuration path to check for a boolean value indicating whether * the current or original read qualities should be written. True indicates * to write the original qualities. The default is false. */ - val WRITE_ORIGINAL_QUALITIES = "org.bdgenomics.adam.rdd.read.AlignmentRecordRDD.writeOriginalQualities" + val WRITE_ORIGINAL_QUALITIES = "org.bdgenomics.adam.rdd.read.AlignmentRecordDataset.writeOriginalQualities" /** * Hadoop configuration path to check for a boolean value indicating whether * to write the "/1" "/2" suffixes to the read name that indicate whether a * read is first or second in a pair. Default is false (no suffixes). */ - val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.read.AlignmentRecordRDD.writeSuffixes" + val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.read.AlignmentRecordDataset.writeSuffixes" /** * Converts a processing step back to the SAM representation. @@ -127,13 +127,13 @@ object AlignmentRecordRDD extends Serializable { } /** - * Builds an AlignmentRecordRDD for unaligned reads. + * Builds an AlignmentRecordDataset for unaligned reads. * * @param rdd The underlying AlignmentRecord RDD. - * @return A new AlignmentRecordRDD. + * @return A new AlignmentRecordDataset. */ - def unaligned(rdd: RDD[AlignmentRecord]): AlignmentRecordRDD = { - RDDBoundAlignmentRecordRDD(rdd, + def unaligned(rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + RDDBoundAlignmentRecordDataset(rdd, SequenceDictionary.empty, RecordGroupDictionary.empty, Seq.empty, @@ -169,41 +169,49 @@ object AlignmentRecordRDD extends Serializable { } /** - * Builds an AlignmentRecordRDD without a partition map. + * Builds an AlignmentRecordDataset without a partition map from an RDD. * * @param rdd The underlying AlignmentRecord RDD. - * @param sequences The sequence dictionary for the RDD. - * @param recordGroups The record group dictionary for the RDD. 
- * @return A new AlignmentRecordRDD. + * @param sequences The sequence dictionary for the genomic dataset. + * @param recordGroups The record group dictionary for the genomic dataset. + * @return A new AlignmentRecordDataset. */ def apply(rdd: RDD[AlignmentRecord], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, - processingSteps: Seq[ProcessingStep]): AlignmentRecordRDD = { - RDDBoundAlignmentRecordRDD(rdd, + processingSteps: Seq[ProcessingStep]): AlignmentRecordDataset = { + RDDBoundAlignmentRecordDataset(rdd, sequences, recordGroups, processingSteps, None) } + /** + * Builds an AlignmentRecordDataset without a partition map from a Dataset. + * + * @param ds The underlying AlignmentRecord Dataset. + * @param sequences The sequence dictionary for the genomic dataset. + * @param recordGroups The record group dictionary for the genomic dataset. + * @return A new AlignmentRecordDataset. + */ def apply(ds: Dataset[AlignmentRecordProduct], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, - processingSteps: Seq[ProcessingStep]): AlignmentRecordRDD = { - DatasetBoundAlignmentRecordRDD(ds, + processingSteps: Seq[ProcessingStep]): AlignmentRecordDataset = { + DatasetBoundAlignmentRecordDataset(ds, sequences, recordGroups, processingSteps) } } -case class ParquetUnboundAlignmentRecordRDD private[rdd] ( +case class ParquetUnboundAlignmentRecordDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, - @transient val processingSteps: Seq[ProcessingStep]) extends AlignmentRecordRDD { + @transient val processingSteps: Seq[ProcessingStep]) extends AlignmentRecordDataset { lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) @@ -218,30 +226,30 @@ case class ParquetUnboundAlignmentRecordRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): AlignmentRecordRDD = { + newSequences: SequenceDictionary): AlignmentRecordDataset = { copy(sequences = newSequences) } def replaceRecordGroups( - newRecordGroups: RecordGroupDictionary): AlignmentRecordRDD = { + newRecordGroups: RecordGroupDictionary): AlignmentRecordDataset = { copy(recordGroups = newRecordGroups) } def replaceProcessingSteps( - newProcessingSteps: Seq[ProcessingStep]): AlignmentRecordRDD = { + newProcessingSteps: Seq[ProcessingStep]): AlignmentRecordDataset = { copy(processingSteps = newProcessingSteps) } } -case class DatasetBoundAlignmentRecordRDD private[rdd] ( +case class DatasetBoundAlignmentRecordDataset private[rdd] ( dataset: Dataset[AlignmentRecordProduct], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, @transient val processingSteps: Seq[ProcessingStep], override val isPartitioned: Boolean = true, override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends AlignmentRecordRDD - with DatasetBoundGenomicDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { + override val optLookbackPartitions: Option[Int] = Some(1)) extends AlignmentRecordDataset + with DatasetBoundGenomicDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { lazy val rdd = dataset.rdd.map(_.toAvro) @@ -262,68 +270,68 @@ case class DatasetBoundAlignmentRecordRDD private[rdd] ( } override def transformDataset( - tFn: Dataset[AlignmentRecordProduct] => Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { + tFn: Dataset[AlignmentRecordProduct] => 
Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { copy(dataset = tFn(dataset)) } def replaceSequences( - newSequences: SequenceDictionary): AlignmentRecordRDD = { + newSequences: SequenceDictionary): AlignmentRecordDataset = { copy(sequences = newSequences) } def replaceRecordGroups( - newRecordGroups: RecordGroupDictionary): AlignmentRecordRDD = { + newRecordGroups: RecordGroupDictionary): AlignmentRecordDataset = { copy(recordGroups = newRecordGroups) } def replaceProcessingSteps( - newProcessingSteps: Seq[ProcessingStep]): AlignmentRecordRDD = { + newProcessingSteps: Seq[ProcessingStep]): AlignmentRecordDataset = { copy(processingSteps = newProcessingSteps) } - override def filterByMapq(minimumMapq: Int): AlignmentRecordRDD = { + override def filterByMapq(minimumMapq: Int): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("mapq") >= minimumMapq)) } - override def filterUnalignedReads(): AlignmentRecordRDD = { + override def filterUnalignedReads(): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("readMapped"))) } - override def filterUnpairedReads(): AlignmentRecordRDD = { + override def filterUnpairedReads(): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("readPaired"))) } - override def filterDuplicateReads(): AlignmentRecordRDD = { + override def filterDuplicateReads(): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(!dataset.col("duplicateRead"))) } - override def filterToPrimaryAlignments(): AlignmentRecordRDD = { + override def filterToPrimaryAlignments(): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("primaryAlignment"))) } - override def filterToRecordGroup(recordGroupName: String): AlignmentRecordRDD = { + override def filterToRecordGroup(recordGroupName: String): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("recordGroupName") === recordGroupName)) } - override def filterToRecordGroups(recordGroupNames: Seq[String]): AlignmentRecordRDD = { + override def filterToRecordGroups(recordGroupNames: Seq[String]): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("recordGroupName") isin (recordGroupNames: _*))) } - override def filterToSample(recordGroupSample: String): AlignmentRecordRDD = { + override def filterToSample(recordGroupSample: String): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("recordGroupSample") === recordGroupSample)) } - override def filterToSamples(recordGroupSamples: Seq[String]): AlignmentRecordRDD = { + override def filterToSamples(recordGroupSamples: Seq[String]): AlignmentRecordDataset = { transformDataset(dataset => dataset.filter(dataset.col("recordGroupSample") isin (recordGroupSamples: _*))) } } -case class RDDBoundAlignmentRecordRDD private[rdd] ( +case class RDDBoundAlignmentRecordDataset private[rdd] ( rdd: RDD[AlignmentRecord], sequences: SequenceDictionary, recordGroups: RecordGroupDictionary, @transient val processingSteps: Seq[ProcessingStep], - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends AlignmentRecordRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends AlignmentRecordDataset { /** * A SQL Dataset of reads. 
@@ -334,7 +342,7 @@ case class RDDBoundAlignmentRecordRDD private[rdd] ( sqlContext.createDataset(rdd.map(AlignmentRecordProduct.fromAvro)) } - override def toCoverage(): CoverageRDD = { + override def toCoverage(): CoverageDataset = { val covCounts = rdd.filter(r => { val readMapped = r.getReadMapped @@ -352,21 +360,21 @@ case class RDDBoundAlignmentRecordRDD private[rdd] ( }).reduceByKey(_ + _) .map(r => Coverage(r._1._2, r._2.toDouble, Option(r._1._1))) - RDDBoundCoverageRDD(covCounts, sequences, recordGroups.toSamples, None) + RDDBoundCoverageDataset(covCounts, sequences, recordGroups.toSamples, None) } def replaceSequences( - newSequences: SequenceDictionary): AlignmentRecordRDD = { + newSequences: SequenceDictionary): AlignmentRecordDataset = { copy(sequences = newSequences) } def replaceRecordGroups( - newRecordGroups: RecordGroupDictionary): AlignmentRecordRDD = { + newRecordGroups: RecordGroupDictionary): AlignmentRecordDataset = { copy(recordGroups = newRecordGroups) } def replaceProcessingSteps( - newProcessingSteps: Seq[ProcessingStep]): AlignmentRecordRDD = { + newProcessingSteps: Seq[ProcessingStep]): AlignmentRecordDataset = { copy(processingSteps = newProcessingSteps) } } @@ -374,7 +382,7 @@ case class RDDBoundAlignmentRecordRDD private[rdd] ( private case class AlignmentWindow(contigName: String, start: Long, end: Long, sampleId: String) { } -sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD] { +sealed abstract class AlignmentRecordDataset extends AvroRecordGroupGenomicDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { protected val productFn = AlignmentRecordProduct.fromAvro(_) protected val unproductFn = (a: AlignmentRecordProduct) => a.toAvro @@ -382,16 +390,16 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A @transient val uTag: TypeTag[AlignmentRecordProduct] = typeTag[AlignmentRecordProduct] /** - * Applies a function that transforms the underlying RDD into a new RDD using + * Applies a function that transforms the underlying Dataset into a new Dataset using * the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a Dataset. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset as a Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) is copied without modification. */ def transformDataset( - tFn: Dataset[AlignmentRecordProduct] => Dataset[AlignmentRecordProduct]): AlignmentRecordRDD = { - DatasetBoundAlignmentRecordRDD(dataset, + tFn: Dataset[AlignmentRecordProduct] => Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + DatasetBoundAlignmentRecordDataset(dataset, sequences, recordGroups, processingSteps) @@ -403,12 +411,12 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * * @param newRdd New RDD to replace current RDD. * @param newSequences New sequence dictionary to replace current dictionary. - * @return Returns a new AlignmentRecordRDD. + * @return Returns a new AlignmentRecordDataset. 
*/ protected def replaceRddAndSequences(newRdd: RDD[AlignmentRecord], newSequences: SequenceDictionary, - partitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): AlignmentRecordRDD = { - RDDBoundAlignmentRecordRDD(newRdd, + partitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): AlignmentRecordDataset = { + RDDBoundAlignmentRecordDataset(newRdd, newSequences, recordGroups, processingSteps, @@ -416,8 +424,8 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } protected def replaceRdd(newRdd: RDD[AlignmentRecord], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): AlignmentRecordRDD = { - RDDBoundAlignmentRecordRDD(newRdd, + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): AlignmentRecordDataset = { + RDDBoundAlignmentRecordDataset(newRdd, sequences, recordGroups, processingSteps, @@ -429,22 +437,22 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A IntervalArray(rdd, AlignmentRecordArray.apply(_, _)) } - def union(rdds: AlignmentRecordRDD*): AlignmentRecordRDD = { - val iterableRdds = rdds.toSeq - AlignmentRecordRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), - iterableRdds.map(_.recordGroups).fold(recordGroups)(_ ++ _), - iterableRdds.map(_.processingSteps).fold(processingSteps)(_ ++ _)) + def union(datasets: AlignmentRecordDataset*): AlignmentRecordDataset = { + val iterableDatasets = datasets.toSeq + AlignmentRecordDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), + iterableDatasets.map(_.recordGroups).fold(recordGroups)(_ ++ _), + iterableDatasets.map(_.processingSteps).fold(processingSteps)(_ ++ _)) } /** * Convert this set of reads into fragments. * - * @return Returns a FragmentRDD where all reads have been grouped together by + * @return Returns a FragmentDataset where all reads have been grouped together by * the original sequence fragment they come from. */ - def toFragments(): FragmentRDD = { - FragmentRDD(groupReadsByFragment().map(_.toFragment), + def toFragments(): FragmentDataset = { + FragmentDataset(groupReadsByFragment().map(_.toFragment), sequences, recordGroups, processingSteps) @@ -464,22 +472,22 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * * Assumes that reads are sorted by readname. * - * @return Returns a FragmentRDD where all reads have been grouped together by + * @return Returns a FragmentDataset where all reads have been grouped together by * the original sequence fragment they come from. */ - private[rdd] def querynameSortedToFragments: FragmentRDD = { - FragmentRDD(locallyGroupReadsByFragment().map(_.toFragment), + private[rdd] def querynameSortedToFragments: FragmentDataset = { + FragmentDataset(locallyGroupReadsByFragment().map(_.toFragment), sequences, recordGroups, processingSteps) } /** - * Converts this set of reads into a corresponding CoverageRDD. + * Converts this set of reads into a corresponding CoverageDataset. * - * @return CoverageRDD containing mapped RDD of Coverage. + * @return CoverageDataset containing mapped genomic dataset of Coverage. 
*/ - def toCoverage(): CoverageRDD = { + def toCoverage(): CoverageDataset = { import dataset.sqlContext.implicits._ val covCounts = dataset.toDF .where($"readMapped") @@ -505,7 +513,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A .withColumnRenamed("sum(count)", "count") .as[Coverage] - DatasetBoundCoverageRDD(covCounts, sequences, recordGroups.toSamples) + DatasetBoundCoverageDataset(covCounts, sequences, recordGroups.toSamples) } /** @@ -524,7 +532,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Saves this RDD as BAM, CRAM, or SAM if the extension provided is .sam, .cram, + * Saves this genomic dataset as BAM, CRAM, or SAM if the extension provided is .sam, .cram, * or .bam. * * @param args Arguments defining where to save the file. @@ -554,7 +562,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Saves the RDD as FASTQ if the file has the proper extension. + * Saves this genomic dataset as FASTQ if the file has the proper extension. * * @param args Save arguments defining the file path to save at. * @return True if the file extension ended in ".fq" or ".fastq" and the file @@ -594,7 +602,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Saves this RDD to disk, with the type identified by the extension. + * Saves this genomic dataset to disk, with the type identified by the extension. * * @param filePath Path to save the file at. * @param isSorted Whether the file is sorted or not. @@ -606,13 +614,13 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Converts an RDD into the SAM spec string it represents. + * Converts a genomic dataset into the SAM spec string it represents. * - * This method converts an RDD of AlignmentRecords back to an RDD of + * This method converts a genomic dataset of AlignmentRecords back to an RDD of * SAMRecordWritables and a SAMFileHeader, and then maps this RDD into a * string on the driver that represents this file in SAM. * - * @return A string on the driver representing this RDD of reads in SAM format. + * @return A string on the driver representing this genomic dataset of reads in SAM format. */ def saveAsSamString(): String = { @@ -674,7 +682,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A // get program records and attach to header val pgRecords = processingSteps.map(r => { - AlignmentRecordRDD.processingStepToSam(r) + AlignmentRecordDataset.processingStepToSam(r) }) header.setProgramRecords(pgRecords) @@ -743,7 +751,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Saves an RDD of ADAM read data into the SAM/BAM format. + * Saves this genomic dataset of ADAM read data into the SAM/BAM format. * * @param filePath Path to save files to. * @param asType Selects whether to save as SAM, BAM, or CRAM. The default @@ -920,7 +928,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Saves this RDD to disk as a SAM/BAM/CRAM file. + * Saves this genomic dataset to disk as a SAM/BAM/CRAM file. * * @param filePath Path to save the file at. * @param asType The SAMFormat to save as. If left null, we will infer the @@ -944,7 +952,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * * @return Returns a new RDD containing sorted reads. 
*/ - def sortReadsByReadName(): AlignmentRecordRDD = SortReads.time { + def sortReadsByReadName(): AlignmentRecordDataset = SortReads.time { log.info("Sorting reads by read name") transformDataset(_.orderBy("readName", "readInFragment")) @@ -957,11 +965,11 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * put at the end and sorted by read name. Contigs are ordered * lexicographically. * - * @return Returns a new RDD containing sorted reads. + * @return Returns a new genomic dataset containing sorted reads. * * @see sortReadsByReferencePositionAndIndex */ - def sortReadsByReferencePosition(): AlignmentRecordRDD = SortReads.time { + def sortReadsByReferencePosition(): AlignmentRecordDataset = SortReads.time { log.info("Sorting reads by reference position") // NOTE: In order to keep unmapped reads from swamping a single partition @@ -984,11 +992,11 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * put at the end and sorted by read name. Contigs are ordered by index * that they are ordered in the SequenceDictionary. * - * @return Returns a new RDD containing sorted reads. + * @return Returns a new genomic dataset containing sorted reads. * * @see sortReadsByReferencePosition */ - def sortReadsByReferencePositionAndIndex(): AlignmentRecordRDD = SortByIndex.time { + def sortReadsByReferencePositionAndIndex(): AlignmentRecordDataset = SortByIndex.time { log.info("Sorting reads by reference index, using %s.".format(sequences)) import scala.math.Ordering.{ Int => ImplicitIntOrdering, _ } @@ -1017,10 +1025,10 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A /** * Marks reads as possible fragment duplicates. * - * @return A new RDD where reads have the duplicate read flag set. Duplicate + * @return A new genomic dataset where reads have the duplicate read flag set. Duplicate * reads are NOT filtered out. */ - def markDuplicates(): AlignmentRecordRDD = MarkDuplicatesInDriver.time { + def markDuplicates(): AlignmentRecordDataset = MarkDuplicatesInDriver.time { replaceRdd(MarkDuplicates(this)) } @@ -1034,12 +1042,12 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param minAcceptableQuality The minimum quality score to recalibrate. * @param storageLevel An optional storage level to set for the output * of the first stage of BQSR. Set to null to omit. - * @return Returns an RDD of recalibrated reads. + * @return Returns a genomic dataset of recalibrated reads. */ def recalibrateBaseQualities( - knownSnps: VariantRDD, + knownSnps: VariantDataset, minAcceptableQuality: java.lang.Integer, - storageLevel: StorageLevel): AlignmentRecordRDD = { + storageLevel: StorageLevel): AlignmentRecordDataset = { val snpTable = SnpTable(knownSnps) val bcastSnps = rdd.context.broadcast(snpTable) val sMinQual: Int = minAcceptableQuality @@ -1059,14 +1067,14 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param optSamplingFraction An optional fraction of reads to sample when * generating the covariate table. * @param optSamplingSeed An optional seed to provide if downsampling reads. - * @return Returns an RDD of recalibrated reads. + * @return Returns a genomic dataset of recalibrated reads. 
*/ def recalibrateBaseQualities( knownSnps: Broadcast[SnpTable], minAcceptableQuality: Int = 5, optStorageLevel: Option[StorageLevel] = Some(StorageLevel.MEMORY_ONLY), optSamplingFraction: Option[Double] = None, - optSamplingSeed: Option[Long] = None): AlignmentRecordRDD = BQSRInDriver.time { + optSamplingSeed: Option[Long] = None): AlignmentRecordDataset = BQSRInDriver.time { replaceRdd(BaseQualityRecalibration(rdd, knownSnps, recordGroups, @@ -1092,7 +1100,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * are only finalized if the log-odds threshold is exceeded. * @param maxTargetSize The maximum width of a single target region for * realignment. - * @return Returns an RDD of mapped reads which have been realigned. + * @return Returns a genomic dataset of mapped reads which have been realigned. */ def realignIndels( consensusModel: ConsensusGenerator, @@ -1100,7 +1108,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A maxIndelSize: java.lang.Integer, maxConsensusNumber: java.lang.Integer, lodThreshold: java.lang.Double, - maxTargetSize: java.lang.Integer): AlignmentRecordRDD = { + maxTargetSize: java.lang.Integer): AlignmentRecordDataset = { replaceRdd(RealignIndels(rdd, consensusModel, isSorted: Boolean, @@ -1127,7 +1135,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * will be inferred from MD tags. * @param unclipReads If true, unclips reads prior to realignment. Else, * omits clipped bases during realignment. - * @return Returns an RDD of mapped reads which have been realigned. + * @return Returns a genomic dataset of mapped reads which have been realigned. */ def realignIndels( consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, @@ -1138,7 +1146,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A maxTargetSize: Int = 3000, maxReadsPerTarget: Int = 20000, optReferenceFile: Option[ReferenceFile] = None, - unclipReads: Boolean = false): AlignmentRecordRDD = RealignIndelsInDriver.time { + unclipReads: Boolean = false): AlignmentRecordDataset = RealignIndelsInDriver.time { replaceRdd(RealignIndels(rdd, consensusModel = consensusModel, dataIsSorted = isSorted, @@ -1162,13 +1170,13 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * find that the MD tag that was previously on the read doesn't match our * new tag, LENIENT will log a warning message, STRICT will throw an * exception, and SILENT will ignore. Default is LENIENT. - * @return Returns a new AlignmentRecordRDD where all reads have the + * @return Returns a new AlignmentRecordDataset where all reads have the * mismatchingPositions field populated. */ def computeMismatchingPositions( referenceFile: ReferenceFile, overwriteExistingTags: Boolean = false, - validationStringency: ValidationStringency = ValidationStringency.LENIENT): AlignmentRecordRDD = { + validationStringency: ValidationStringency = ValidationStringency.LENIENT): AlignmentRecordDataset = { replaceRdd(MDTagging(rdd, referenceFile, overwriteExistingTags = overwriteExistingTags, @@ -1212,7 +1220,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param disableFastConcat If asSingleFile is true, disables the use of the * parallel file merging engine. * @param validationStringency Iff strict, throw an exception if any read in - * this RDD is not accompanied by its mate. + * this genomic dataset is not accompanied by its mate. 
* @param persistLevel The persistence level to cache reads at between passes. */ def saveAsPairedFastq( @@ -1250,7 +1258,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param disableFastConcat If asSingleFile is true, disables the use of the * parallel file merging engine. * @param validationStringency Iff strict, throw an exception if any read in - * this RDD is not accompanied by its mate. + * this genomic dataset is not accompanied by its mate. * @param persistLevel An optional persistance level to set. If this level is * set, then reads will be cached (at the given persistance) level between * passes. @@ -1389,7 +1397,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param disableFastConcat If asSingleFile is true, disables the use of the * parallel file merging engine. * @param validationStringency Iff strict, throw an exception if any read in - * this RDD is not accompanied by its mate. + * this genomic dataset is not accompanied by its mate. */ def saveAsFastq( fileName: String, @@ -1425,7 +1433,7 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param disableFastConcat If asSingleFile is true, disables the use of the * parallel file merging engine. * @param validationStringency Iff strict, throw an exception if any read in - * this RDD is not accompanied by its mate. + * this genomic dataset is not accompanied by its mate. * @param persistLevel An optional persistance level to set. If this level is * set, then reads will be cached (at the given persistance) level between * passes. @@ -1481,11 +1489,11 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @note The RDD that this is called on should be the RDD with the first read from the pair. * @param secondPairRdd The rdd containing the second read from the pairs. * @param validationStringency How stringently to validate the reads. - * @return Returns an RDD with the pair information recomputed. + * @return Returns a genomic dataset with the pair information recomputed. */ def reassembleReadPairs( secondPairRdd: JavaRDD[AlignmentRecord], - validationStringency: ValidationStringency): AlignmentRecordRDD = { + validationStringency: ValidationStringency): AlignmentRecordDataset = { reassembleReadPairs(secondPairRdd.rdd, validationStringency = validationStringency) } @@ -1497,11 +1505,11 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @note The RDD that this is called on should be the RDD with the first read from the pair. * @param secondPairRdd The rdd containing the second read from the pairs. * @param validationStringency How stringently to validate the reads. - * @return Returns an RDD with the pair information recomputed. + * @return Returns a genomic dataset with the pair information recomputed. */ def reassembleReadPairs( secondPairRdd: RDD[AlignmentRecord], - validationStringency: ValidationStringency = ValidationStringency.LENIENT): AlignmentRecordRDD = { + validationStringency: ValidationStringency = ValidationStringency.LENIENT): AlignmentRecordDataset = { // cache rdds val firstPairRdd = rdd.cache() secondPairRdd.cache() @@ -1567,18 +1575,18 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A * @param bins The bins to use. * @return Reads whose quality scores are binned. 
*/ - def binQualityScores(bins: Seq[QualityScoreBin]): AlignmentRecordRDD = { - AlignmentRecordRDD.validateBins(bins) + def binQualityScores(bins: Seq[QualityScoreBin]): AlignmentRecordDataset = { + AlignmentRecordDataset.validateBins(bins) BinQualities(this, bins) } /** * Left normalizes the INDELs in reads containing INDELs. * - * @return Returns a new RDD where the reads that contained INDELs have their + * @return Returns a new genomic dataset where the reads that contained INDELs have their * INDELs left normalized. */ - def leftNormalizeIndels(): AlignmentRecordRDD = { + def leftNormalizeIndels(): AlignmentRecordDataset = { transform(rdd => { rdd.map(r => { if (!r.getReadMapped || r.getCigar == null) { @@ -1601,88 +1609,88 @@ sealed abstract class AlignmentRecordRDD extends AvroRecordGroupGenomicDataset[A } /** - * Filter this AlignmentRecordRDD by mapping quality. + * Filter this AlignmentRecordDataset by mapping quality. * * @param minimumMapq Minimum mapping quality to filter by, inclusive. - * @return AlignmentRecordRDD filtered by mapping quality. + * @return AlignmentRecordDataset filtered by mapping quality. */ - def filterByMapq(minimumMapq: Int): AlignmentRecordRDD = { + def filterByMapq(minimumMapq: Int): AlignmentRecordDataset = { transform(rdd => rdd.filter(g => Option(g.getMapq).exists(_ >= minimumMapq))) } /** - * Filter unaligned reads from this AlignmentRecordRDD. + * Filter unaligned reads from this AlignmentRecordDataset. * - * @return AlignmentRecordRDD filtered to remove unaligned reads. + * @return AlignmentRecordDataset filtered to remove unaligned reads. */ - def filterUnalignedReads(): AlignmentRecordRDD = { + def filterUnalignedReads(): AlignmentRecordDataset = { transform(rdd => rdd.filter(_.getReadMapped)) } /** - * Filter unpaired reads from this AlignmentRecordRDD. + * Filter unpaired reads from this AlignmentRecordDataset. * - * @return AlignmentRecordRDD filtered to remove unpaired reads. + * @return AlignmentRecordDataset filtered to remove unpaired reads. */ - def filterUnpairedReads(): AlignmentRecordRDD = { + def filterUnpairedReads(): AlignmentRecordDataset = { transform(rdd => rdd.filter(_.getReadPaired)) } /** - * Filter duplicate reads from this AlignmentRecordRDD. + * Filter duplicate reads from this AlignmentRecordDataset. * - * @return AlignmentRecordRDD filtered to remove duplicate reads. + * @return AlignmentRecordDataset filtered to remove duplicate reads. */ - def filterDuplicateReads(): AlignmentRecordRDD = { + def filterDuplicateReads(): AlignmentRecordDataset = { transform(rdd => rdd.filter(!_.getDuplicateRead)) } /** - * Filter this AlignmentRecordRDD to include only primary alignments. + * Filter this AlignmentRecordDataset to include only primary alignments. * - * @return AlignmentRecordRDD filtered to include only primary alignments. + * @return AlignmentRecordDataset filtered to include only primary alignments. */ - def filterToPrimaryAlignments(): AlignmentRecordRDD = { + def filterToPrimaryAlignments(): AlignmentRecordDataset = { transform(rdd => rdd.filter(_.getPrimaryAlignment)) } /** - * Filter this AlignmentRecordRDD by record group to those that match the specified record group. + * Filter this AlignmentRecordDataset by record group to those that match the specified record group. * * @param recordGroupName Record group to filter by. - * @return AlignmentRecordRDD filtered by record group. + * @return AlignmentRecordDataset filtered by record group. 
*/ - def filterToRecordGroup(recordGroupName: String): AlignmentRecordRDD = { + def filterToRecordGroup(recordGroupName: String): AlignmentRecordDataset = { transform(rdd => rdd.filter(g => Option(g.getRecordGroupName).exists(_ == recordGroupName))) } /** - * Filter this AlignmentRecordRDD by record group to those that match the specified record groups. + * Filter this AlignmentRecordDataset by record group to those that match the specified record groups. * * @param recordGroupNames Sequence of record groups to filter by. - * @return AlignmentRecordRDD filtered by one or more record groups. + * @return AlignmentRecordDataset filtered by one or more record groups. */ - def filterToRecordGroups(recordGroupNames: Seq[String]): AlignmentRecordRDD = { + def filterToRecordGroups(recordGroupNames: Seq[String]): AlignmentRecordDataset = { transform(rdd => rdd.filter(g => Option(g.getRecordGroupName).exists(recordGroupNames.contains(_)))) } /** - * Filter this AlignmentRecordRDD by sample to those that match the specified sample. + * Filter this AlignmentRecordDataset by sample to those that match the specified sample. * * @param recordGroupSample Sample to filter by. - * @return AlignmentRecordRDD filtered by the specified sample. + * @return AlignmentRecordDataset filtered by the specified sample. */ - def filterToSample(recordGroupSample: String): AlignmentRecordRDD = { + def filterToSample(recordGroupSample: String): AlignmentRecordDataset = { transform(rdd => rdd.filter(g => Option(g.getRecordGroupSample).exists(_ == recordGroupSample))) } /** - * Filter this AlignmentRecordRDD by sample to those that match the specified samples. + * Filter this AlignmentRecordDataset by sample to those that match the specified samples. * * @param recordGroupSamples Sequence of samples to filter by. - * @return AlignmentRecordRDD filtered by the specified samples. + * @return AlignmentRecordDataset filtered by the specified samples. */ - def filterToSamples(recordGroupSamples: Seq[String]): AlignmentRecordRDD = { + def filterToSamples(recordGroupSamples: Seq[String]): AlignmentRecordDataset = { transform(rdd => rdd.filter(g => Option(g.getRecordGroupSample).exists(recordGroupSamples.contains(_)))) } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala index d552360b35..f782f88608 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala @@ -34,28 +34,28 @@ import org.bdgenomics.formats.avro.AlignmentRecord * * @tparam T The type of the underlying InFormatter. */ -trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterCompanion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, T] { +trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterCompanion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, T] { protected def makeFormatter(header: SAMFileHeaderWritable, recordGroups: RecordGroupDictionary, converter: AlignmentRecordConverter): T /** - * Makes an AnySAMInFormatter from a GenomicRDD of AlignmentRecords. + * Makes an AnySAMInFormatter from a GenomicDataset of AlignmentRecords. * - * @param gRdd AlignmentRecordRDD with reference build and record group info. + * @param gDataset AlignmentRecordDataset with reference build and record group info. 
* @return Returns an InFormatter that extends AnySAMInFormatter. */ - def apply(gRdd: AlignmentRecordRDD): T = { + def apply(gDataset: AlignmentRecordDataset): T = { // make a converter val arc = new AlignmentRecordConverter // build a header and set the sort order - val header = arc.createSAMHeader(gRdd.sequences, gRdd.recordGroups) + val header = arc.createSAMHeader(gDataset.sequences, gDataset.recordGroups) header.setSortOrder(SAMFileHeader.SortOrder.coordinate) // construct the in formatter - makeFormatter(SAMFileHeaderWritable(header), gRdd.recordGroups, arc) + makeFormatter(SAMFileHeaderWritable(header), gDataset.recordGroups, arc) } } @@ -64,7 +64,7 @@ trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterC * * @tparam T The recursive type of the class that implements this trait. */ -trait AnySAMInFormatter[T <: AnySAMInFormatter[T]] extends InFormatter[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, T] { +trait AnySAMInFormatter[T <: AnySAMInFormatter[T]] extends InFormatter[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, T] { /** * A serializable form of the SAM File Header. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BinQualities.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BinQualities.scala index 64511588e4..221e974912 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BinQualities.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BinQualities.scala @@ -17,7 +17,7 @@ */ package org.bdgenomics.adam.rdd.read -import org.bdgenomics.adam.rdd.fragment.FragmentRDD +import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.formats.avro.{ AlignmentRecord, Fragment } import scala.collection.JavaConversions._ @@ -118,8 +118,8 @@ private[rdd] object BinQualities extends Serializable { * @return Returns a new RDD of reads were the quality scores of the read * bases have been binned. */ - def apply(reads: AlignmentRecordRDD, - bins: Seq[QualityScoreBin]): AlignmentRecordRDD = { + def apply(reads: AlignmentRecordDataset, + bins: Seq[QualityScoreBin]): AlignmentRecordDataset = { reads.transform(rdd => { rdd.map(binRead(_, bins)) @@ -134,8 +134,8 @@ private[rdd] object BinQualities extends Serializable { * @return Returns a new RDD of fragments were the quality scores of the fragment * bases have been binned. 
*/ - def apply(fragments: FragmentRDD, - bins: Seq[QualityScoreBin]): FragmentRDD = { + def apply(fragments: FragmentDataset, + bins: Seq[QualityScoreBin]): FragmentDataset = { fragments.transform(rdd => { rdd.map(binFragment(_, bins)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FASTQInFormatter.scala index 31295160b5..b70fc4379c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FASTQInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FASTQInFormatter.scala @@ -21,7 +21,7 @@ import java.io.OutputStream import org.apache.hadoop.conf.Configuration import org.bdgenomics.adam.converters.AlignmentRecordConverter import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD +import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct } import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.misc.Logging @@ -29,26 +29,26 @@ import org.bdgenomics.utils.misc.Logging /** * InFormatter companion that creates an InFormatter that writes FASTQ. */ -object FASTQInFormatter extends InFormatterCompanion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, FASTQInFormatter] { +object FASTQInFormatter extends InFormatterCompanion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, FASTQInFormatter] { /** * Builds an FASTQInFormatter to write FASTQ. * - * @param gRdd GenomicRDD of AlignmentRecords. Used to get HadoopConfiguration. + * @param gDataset GenomicDataset of AlignmentRecords. Used to get HadoopConfiguration. * @return Returns a new Single FASTQ InFormatter. */ - def apply(gRdd: AlignmentRecordRDD): FASTQInFormatter = { - new FASTQInFormatter(gRdd.rdd.context.hadoopConfiguration) + def apply(gDataset: AlignmentRecordDataset): FASTQInFormatter = { + new FASTQInFormatter(gDataset.rdd.context.hadoopConfiguration) } } class FASTQInFormatter private ( - conf: Configuration) extends InFormatter[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, FASTQInFormatter] with Logging { + conf: Configuration) extends InFormatter[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, FASTQInFormatter] with Logging { protected val companion = FASTQInFormatter private val converter = new AlignmentRecordConverter - private val writeSuffixes = conf.getBoolean(AlignmentRecordRDD.WRITE_SUFFIXES, false) - private val writeOriginalQualities = conf.getBoolean(AlignmentRecordRDD.WRITE_ORIGINAL_QUALITIES, false) + private val writeSuffixes = conf.getBoolean(AlignmentRecordDataset.WRITE_SUFFIXES, false) + private val writeOriginalQualities = conf.getBoolean(AlignmentRecordDataset.WRITE_ORIGINAL_QUALITIES, false) /** * Writes alignment records to an output stream in FASTQ format. 
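To show how the renamed read API composes, here is a hedged sketch of a small post-alignment cleanup pass. It assumes reads is an existing AlignmentRecordDataset and uses only methods that appear in the diff above; the mapping-quality cutoff of 30 and the helper name cleanAndSummarize are illustrative:

import org.bdgenomics.adam.rdd.feature.CoverageDataset
import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset

def cleanAndSummarize(reads: AlignmentRecordDataset): (AlignmentRecordDataset, CoverageDataset) = {
  val cleaned = reads
    .markDuplicates()            // flag fragment duplicates; reads are kept, not removed
    .filterDuplicateReads()      // then drop the flagged duplicates
    .filterUnalignedReads()      // drop reads that did not map
    .filterToPrimaryAlignments() // keep primary alignments only
    .filterByMapq(30)            // mapping quality >= 30, inclusive
  // collapse the surviving reads to per-sample coverage
  (cleaned, cleaned.toCoverage())
}

Every step returns an AlignmentRecordDataset, so the sequence dictionary, record groups, and processing steps are carried along by replaceRdd or transformDataset rather than being rebuilt at each stage.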
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala index c4480d6c1b..32d6b1364d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala @@ -25,7 +25,7 @@ import org.bdgenomics.adam.models.{ ReferencePosition } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.fragment.FragmentRDD +import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.formats.avro.{ AlignmentRecord, Fragment } private[rdd] object MarkDuplicates extends Serializable with Logging { @@ -63,13 +63,13 @@ private[rdd] object MarkDuplicates extends Serializable with Logging { }) } - def apply(rdd: AlignmentRecordRDD): RDD[AlignmentRecord] = { + def apply(rdd: AlignmentRecordDataset): RDD[AlignmentRecord] = { markBuckets(rdd.groupReadsByFragment(), rdd.recordGroups) .flatMap(_.allReads) } - def apply(rdd: FragmentRDD): RDD[Fragment] = { + def apply(rdd: FragmentDataset): RDD[Fragment] = { markBuckets(rdd.rdd.map(f => SingleReadBucket(f)), rdd.recordGroups) .map(_.toFragment) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeDataset.scala similarity index 73% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeDataset.scala index 10f1a5993a..6cb4d94d13 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/GenotypeDataset.scala @@ -82,49 +82,49 @@ private[adam] class GenotypeArraySerializer extends IntervalArraySerializer[Refe } } -object GenotypeRDD extends Serializable { +object GenotypeDataset extends Serializable { /** - * An RDD containing genotypes called in a set of samples against a given + * A genomic dataset containing genotypes called in a set of samples against a given * reference genome. * * @param rdd Called genotypes. * @param sequences A dictionary describing the reference genome. * @param samples The samples called. * @param headerLines The VCF header lines that cover all INFO/FORMAT fields - * needed to represent this RDD of Genotypes. + * needed to represent this genomic dataset of Genotypes. */ def apply(rdd: RDD[Genotype], sequences: SequenceDictionary, samples: Iterable[Sample], - headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): GenotypeRDD = { - RDDBoundGenotypeRDD(rdd, sequences, samples.toSeq, headerLines, None) + headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): GenotypeDataset = { + RDDBoundGenotypeDataset(rdd, sequences, samples.toSeq, headerLines, None) } /** - * An RDD containing genotypes called in a set of samples against a given + * A genomic dataset containing genotypes called in a set of samples against a given * reference genome, populated from a SQL Dataset. * * @param ds Called genotypes. * @param sequences A dictionary describing the reference genome. * @param samples The samples called. * @param headerLines The VCF header lines that cover all INFO/FORMAT fields - * needed to represent this RDD of Genotypes. + * needed to represent this genomic dataset of Genotypes.
*/ def apply(ds: Dataset[GenotypeProduct], sequences: SequenceDictionary, samples: Iterable[Sample], - headerLines: Seq[VCFHeaderLine]): GenotypeRDD = { - DatasetBoundGenotypeRDD(ds, sequences, samples.toSeq, headerLines) + headerLines: Seq[VCFHeaderLine]): GenotypeDataset = { + DatasetBoundGenotypeDataset(ds, sequences, samples.toSeq, headerLines) } } -case class ParquetUnboundGenotypeRDD private[rdd] ( +case class ParquetUnboundGenotypeDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, sequences: SequenceDictionary, @transient samples: Seq[Sample], - @transient headerLines: Seq[VCFHeaderLine]) extends GenotypeRDD { + @transient headerLines: Seq[VCFHeaderLine]) extends GenotypeDataset { protected lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) @@ -139,28 +139,28 @@ case class ParquetUnboundGenotypeRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): GenotypeRDD = { + newSequences: SequenceDictionary): GenotypeDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): GenotypeRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): GenotypeDataset = { copy(headerLines = newHeaderLines) } - def replaceSamples(newSamples: Iterable[Sample]): GenotypeRDD = { + def replaceSamples(newSamples: Iterable[Sample]): GenotypeDataset = { copy(samples = newSamples.toSeq) } } -case class DatasetBoundGenotypeRDD private[rdd] ( +case class DatasetBoundGenotypeDataset private[rdd] ( dataset: Dataset[GenotypeProduct], sequences: SequenceDictionary, @transient samples: Seq[Sample], @transient headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines, override val isPartitioned: Boolean = true, override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends GenotypeRDD - with DatasetBoundGenomicDataset[Genotype, GenotypeProduct, GenotypeRDD] { + override val optLookbackPartitions: Option[Int] = Some(1)) extends GenotypeDataset + with DatasetBoundGenomicDataset[Genotype, GenotypeProduct, GenotypeDataset] { protected lazy val optPartitionMap = None @@ -181,24 +181,24 @@ case class DatasetBoundGenotypeRDD private[rdd] ( } override def transformDataset( - tFn: Dataset[GenotypeProduct] => Dataset[GenotypeProduct]): GenotypeRDD = { + tFn: Dataset[GenotypeProduct] => Dataset[GenotypeProduct]): GenotypeDataset = { copy(dataset = tFn(dataset)) } def replaceSequences( - newSequences: SequenceDictionary): GenotypeRDD = { + newSequences: SequenceDictionary): GenotypeDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): GenotypeRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): GenotypeDataset = { copy(headerLines = newHeaderLines) } - def replaceSamples(newSamples: Iterable[Sample]): GenotypeRDD = { + def replaceSamples(newSamples: Iterable[Sample]): GenotypeDataset = { copy(samples = newSamples.toSeq) } - override def copyVariantEndToAttribute(): GenotypeRDD = { + override def copyVariantEndToAttribute(): GenotypeDataset = { def copyEnd(g: GenotypeProduct): GenotypeProduct = { val variant = g.variant.getOrElse(VariantProduct()) val annotation = variant.annotation.getOrElse(VariantAnnotationProduct()) @@ -212,23 +212,23 @@ case class DatasetBoundGenotypeRDD private[rdd] ( transformDataset(dataset => dataset.map(copyEnd)) } - override def filterToFiltersPassed(): GenotypeRDD = { + override def filterToFiltersPassed(): 
GenotypeDataset = { transformDataset(dataset => dataset.filter(dataset.col("variantCallingAnnotations.filtersPassed"))) } - override def filterByQuality(minimumQuality: Double): GenotypeRDD = { + override def filterByQuality(minimumQuality: Double): GenotypeDataset = { transformDataset(dataset => dataset.filter(dataset.col("genotypeQuality") >= minimumQuality)) } - override def filterByReadDepth(minimumReadDepth: Int): GenotypeRDD = { + override def filterByReadDepth(minimumReadDepth: Int): GenotypeDataset = { transformDataset(dataset => dataset.filter(dataset.col("readDepth") >= minimumReadDepth)) } - override def filterByAlternateReadDepth(minimumAlternateReadDepth: Int): GenotypeRDD = { + override def filterByAlternateReadDepth(minimumAlternateReadDepth: Int): GenotypeDataset = { transformDataset(dataset => dataset.filter(dataset.col("alternateReadDepth") >= minimumAlternateReadDepth)) } - override def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): GenotypeRDD = { + override def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): GenotypeDataset = { transformDataset(dataset => dataset.filter(dataset.col("referenceReadDepth") >= minimumReferenceReadDepth)) } @@ -245,15 +245,15 @@ case class DatasetBoundGenotypeRDD private[rdd] ( } } -case class RDDBoundGenotypeRDD private[rdd] ( +case class RDDBoundGenotypeDataset private[rdd] ( rdd: RDD[Genotype], sequences: SequenceDictionary, @transient samples: Seq[Sample], @transient headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines, - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None) extends GenotypeRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None) extends GenotypeDataset { /** - * A SQL Dataset of reads. + * A SQL Dataset of genotypes. 
*/ lazy val dataset: Dataset[GenotypeProduct] = { val sqlContext = SQLContext.getOrCreate(rdd.context) @@ -262,20 +262,20 @@ case class RDDBoundGenotypeRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): GenotypeRDD = { + newSequences: SequenceDictionary): GenotypeDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): GenotypeRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): GenotypeDataset = { copy(headerLines = newHeaderLines) } - def replaceSamples(newSamples: Iterable[Sample]): GenotypeRDD = { + def replaceSamples(newSamples: Iterable[Sample]): GenotypeDataset = { copy(samples = newSamples.toSeq) } } -sealed abstract class GenotypeRDD extends MultisampleAvroGenomicDataset[Genotype, GenotypeProduct, GenotypeRDD] with VCFSupportingGenomicDataset[Genotype, GenotypeProduct, GenotypeRDD] { +sealed abstract class GenotypeDataset extends MultisampleAvroGenomicDataset[Genotype, GenotypeProduct, GenotypeDataset] with VCFSupportingGenomicDataset[Genotype, GenotypeProduct, GenotypeDataset] { protected val productFn = GenotypeProduct.fromAvro(_) protected val unproductFn = (g: GenotypeProduct) => g.toAvro @@ -303,12 +303,12 @@ sealed abstract class GenotypeRDD extends MultisampleAvroGenomicDataset[Genotype saveVcfHeaders(filePath) } - def union(rdds: GenotypeRDD*): GenotypeRDD = { - val iterableRdds = rdds.toSeq - GenotypeRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), - (samples ++ iterableRdds.flatMap(_.samples)).distinct, - (headerLines ++ iterableRdds.flatMap(_.headerLines)).distinct) + def union(datasets: GenotypeDataset*): GenotypeDataset = { + val iterableDatasets = datasets.toSeq + GenotypeDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), + (samples ++ iterableDatasets.flatMap(_.samples)).distinct, + (headerLines ++ iterableDatasets.flatMap(_.headerLines)).distinct) } protected def buildTree(rdd: RDD[(ReferenceRegion, Genotype)])( @@ -317,22 +317,22 @@ sealed abstract class GenotypeRDD extends MultisampleAvroGenomicDataset[Genotype } /** - * Applies a function that transforms the underlying RDD into a new RDD using + * Applies a function that transforms the underlying Dataset into a new Dataset using * the Spark SQL API. * - * @param tFn A function that transforms the underlying RDD as a Dataset. - * @return A new RDD where the RDD of genomic data has been replaced, but the + * @param tFn A function that transforms the underlying Dataset as a Dataset. + * @return A new genomic dataset where the Dataset of genomic data has been replaced, but the * metadata (sequence dictionary, and etc) is copied without modification. */ def transformDataset( - tFn: Dataset[GenotypeProduct] => Dataset[GenotypeProduct]): GenotypeRDD = { - DatasetBoundGenotypeRDD(tFn(dataset), sequences, samples, headerLines) + tFn: Dataset[GenotypeProduct] => Dataset[GenotypeProduct]): GenotypeDataset = { + DatasetBoundGenotypeDataset(tFn(dataset), sequences, samples, headerLines) } /** - * @return Returns this GenotypeRDD squared off as a VariantContextRDD. + * @return Returns this GenotypeDataset squared off as a VariantContextDataset. 
*/ - def toVariantContexts(): VariantContextRDD = { + def toVariantContexts(): VariantContextDataset = { val vcIntRdd: RDD[(RichVariant, Genotype)] = rdd.keyBy(g => { RichVariant.genotypeToRichVariant(g) }) @@ -343,32 +343,32 @@ sealed abstract class GenotypeRDD extends MultisampleAvroGenomicDataset[Genotype } } - VariantContextRDD(vcRdd, sequences, samples, headerLines) + VariantContextDataset(vcRdd, sequences, samples, headerLines) } /** - * Extracts the variants contained in this RDD of genotypes. + * Extracts the variants contained in this genomic dataset of genotypes. * * Does not perform any filtering looking at whether the variant was called or * not. Does not dedupe the variants. * - * @return Returns the variants described by this GenotypeRDD. + * @return Returns the variants described by this GenotypeDataset. */ - def toVariants(): VariantRDD = { + def toVariants(): VariantDataset = { toVariants(dedupe = false) } /** - * Extracts the variants contained in this RDD of genotypes. + * Extracts the variants contained in this genomic dataset of genotypes. * * Does not perform any filtering looking at whether the variant was called or * not. * * @param dedupe If true, drops variants described in more than one genotype * record. - * @return Returns the variants described by this GenotypeRDD. + * @return Returns the variants described by this GenotypeDataset. */ - def toVariants(dedupe: java.lang.Boolean): VariantRDD = { + def toVariants(dedupe: java.lang.Boolean): VariantDataset = { val sqlContext = SQLContext.getOrCreate(rdd.context) import sqlContext.implicits._ @@ -388,15 +388,15 @@ sealed abstract class GenotypeRDD extends MultisampleAvroGenomicDataset[Genotype notDedupedVariants } - VariantRDD(maybeDedupedVariants, sequences, headerLines) + VariantDataset(maybeDedupedVariants, sequences, headerLines) } /** * Copy variant end to a variant attribute (VCF INFO field "END"). * - * @return GenotypeRDD with variant end copied to a variant attribute. + * @return GenotypeDataset with variant end copied to a variant attribute. */ - def copyVariantEndToAttribute(): GenotypeRDD = { + def copyVariantEndToAttribute(): GenotypeDataset = { def copyEnd(g: Genotype): Genotype = { val variant = Option(g.variant).getOrElse(new Variant()) val annotation = Option(variant.annotation).getOrElse(new VariantAnnotation()) @@ -411,90 +411,90 @@ sealed abstract class GenotypeRDD extends MultisampleAvroGenomicDataset[Genotype } /** - * Filter this GenotypeRDD to genotype filters passed (VCF FORMAT field "FT" value PASS). + * Filter this GenotypeDataset to genotype filters passed (VCF FORMAT field "FT" value PASS). * - * @return GenotypeRDD filtered to genotype filters passed. + * @return GenotypeDataset filtered to genotype filters passed. */ - def filterToFiltersPassed(): GenotypeRDD = { + def filterToFiltersPassed(): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getVariantCallingAnnotations).exists(_.getFiltersPassed))) } /** - * Filter this GenotypeRDD by quality (VCF FORMAT field "GQ"). + * Filter this GenotypeDataset by quality (VCF FORMAT field "GQ"). * * @param minimumQuality Minimum quality to filter by, inclusive. - * @return GenotypeRDD filtered by quality. + * @return GenotypeDataset filtered by quality. 
*/ - def filterByQuality(minimumQuality: Double): GenotypeRDD = { + def filterByQuality(minimumQuality: Double): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getGenotypeQuality).exists(_ >= minimumQuality))) } /** - * Filter this GenotypeRDD by read depth (VCF FORMAT field "DP"). + * Filter this GenotypeDataset by read depth (VCF FORMAT field "DP"). * * @param minimumReadDepth Minimum read depth to filter by, inclusive. - * @return GenotypeRDD filtered by read depth. + * @return GenotypeDataset filtered by read depth. */ - def filterByReadDepth(minimumReadDepth: Int): GenotypeRDD = { + def filterByReadDepth(minimumReadDepth: Int): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getReadDepth).exists(_ >= minimumReadDepth))) } /** - * Filter this GenotypeRDD by alternate read depth (VCF FORMAT field "AD"). + * Filter this GenotypeDataset by alternate read depth (VCF FORMAT field "AD"). * * @param minimumAlternateReadDepth Minimum alternate read depth to filter by, inclusive. - * @return GenotypeRDD filtered by alternate read depth. + * @return GenotypeDataset filtered by alternate read depth. */ - def filterByAlternateReadDepth(minimumAlternateReadDepth: Int): GenotypeRDD = { + def filterByAlternateReadDepth(minimumAlternateReadDepth: Int): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getAlternateReadDepth).exists(_ >= minimumAlternateReadDepth))) } /** - * Filter this GenotypeRDD by reference read depth (VCF FORMAT field "AD"). + * Filter this GenotypeDataset by reference read depth (VCF FORMAT field "AD"). * * @param minimumReferenceReadDepth Minimum reference read depth to filter by, inclusive. - * @return GenotypeRDD filtered by reference read depth. + * @return GenotypeDataset filtered by reference read depth. */ - def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): GenotypeRDD = { + def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getReferenceReadDepth).exists(_ >= minimumReferenceReadDepth))) } /** - * Filter this GenotypeRDD by sample to those that match the specified sample. + * Filter this GenotypeDataset by sample to those that match the specified sample. * * @param sampleId Sample to filter by. - * return GenotypeRDD filtered by sample. + * return GenotypeDataset filtered by sample. */ - def filterToSample(sampleId: String): GenotypeRDD = { + def filterToSample(sampleId: String): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getSampleId).exists(_ == sampleId))) } /** - * Filter this GenotypeRDD by sample to those that match the specified samples. + * Filter this GenotypeDataset by sample to those that match the specified samples. * * @param sampleIds Sequence of samples to filter by. - * return GenotypeRDD filtered by one or more samples. + * return GenotypeDataset filtered by one or more samples. */ - def filterToSamples(sampleIds: Seq[String]): GenotypeRDD = { + def filterToSamples(sampleIds: Seq[String]): GenotypeDataset = { transform(rdd => rdd.filter(g => Option(g.getSampleId).exists(sampleIds.contains(_)))) } /** - * Filter genotypes containing NO_CALL alleles from this GenotypeRDD. + * Filter genotypes containing NO_CALL alleles from this GenotypeDataset. * - * @return GenotypeRDD filtered to remove genotypes containing NO_CALL alleles. + * @return GenotypeDataset filtered to remove genotypes containing NO_CALL alleles. 
*/ - def filterNoCalls(): GenotypeRDD = { + def filterNoCalls(): GenotypeDataset = { transform(rdd => rdd.filter(g => !g.getAlleles.contains(GenotypeAllele.NO_CALL))) } /** * @param newRdd An RDD to replace the underlying RDD with. - * @return Returns a new GenotypeRDD with the underlying RDD replaced. + * @return Returns a new GenotypeDataset with the underlying RDD replaced. */ protected def replaceRdd(newRdd: RDD[Genotype], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): GenotypeRDD = { - RDDBoundGenotypeRDD(newRdd, sequences, samples, headerLines, newPartitionMap) + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): GenotypeDataset = { + RDDBoundGenotypeDataset(newRdd, sequences, samples, headerLines, newPartitionMap) } /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala index 54cf12ed65..544bc30c32 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VCFInFormatter.scala @@ -37,20 +37,20 @@ import scala.collection.JavaConversions._ /** * InFormatter companion that builds a VCFInFormatter to write VCF to a pipe. */ -object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContextProduct, VariantContextRDD, VCFInFormatter] { +object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter] { /** - * Apply method for building the VCFInFormatter from a VariantContextRDD. + * Apply method for building the VCFInFormatter from a VariantContextDataset. * - * @param gRdd VariantContextRDD to build VCF header from. + * @param gDataset VariantContextDataset to build VCF header from. * @return A constructed VCFInFormatter with all needed metadata to write a * VCF header. 
*/ - def apply(gRdd: VariantContextRDD): VCFInFormatter = { - VCFInFormatter(gRdd.sequences, - gRdd.samples.map(_.getSampleId), - gRdd.headerLines, - gRdd.rdd.context.hadoopConfiguration) + def apply(gDataset: VariantContextDataset): VCFInFormatter = { + VCFInFormatter(gDataset.sequences, + gDataset.samples.map(_.getSampleId), + gDataset.headerLines, + gDataset.rdd.context.hadoopConfiguration) } } @@ -58,7 +58,7 @@ case class VCFInFormatter private ( sequences: SequenceDictionary, samples: Seq[String], headerLines: Seq[VCFHeaderLine], - @transient val conf: Configuration) extends InFormatter[VariantContext, VariantContextProduct, VariantContextRDD, VCFInFormatter] { + @transient val conf: Configuration) extends InFormatter[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter] { protected val companion = VCFInFormatter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextDataset.scala similarity index 81% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextDataset.scala index 2dcb47adfa..08a8df1ad3 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantContextDataset.scala @@ -98,70 +98,70 @@ private[adam] class VariantContextArraySerializer extends IntervalArraySerialize } } -object VariantContextRDD extends Serializable { +object VariantContextDataset extends Serializable { /** - * Builds a VariantContextRDD without a partition map. + * Builds a VariantContextDataset without a partition map. * * @param rdd The underlying VariantContext RDD. - * @param sequences The sequence dictionary for the RDD. - * @param samples The samples for the RDD. - * @param headerLines The header lines for the RDD. - * @return A new VariantContextRDD. + * @param sequences The sequence dictionary for the genomic dataset. + * @param samples The samples for the genomic dataset. + * @param headerLines The header lines for the genomic dataset. + * @return A new VariantContextDataset. 
*/ def apply(rdd: RDD[VariantContext], sequences: SequenceDictionary, samples: Iterable[Sample], - headerLines: Seq[VCFHeaderLine]): VariantContextRDD = { - RDDBoundVariantContextRDD(rdd, sequences, samples.toSeq, headerLines, None) + headerLines: Seq[VCFHeaderLine]): VariantContextDataset = { + RDDBoundVariantContextDataset(rdd, sequences, samples.toSeq, headerLines, None) } def apply(rdd: RDD[VariantContext], sequences: SequenceDictionary, - samples: Iterable[Sample]): VariantContextRDD = { - RDDBoundVariantContextRDD(rdd, sequences, samples.toSeq, null) + samples: Iterable[Sample]): VariantContextDataset = { + RDDBoundVariantContextDataset(rdd, sequences, samples.toSeq, null) } } -case class DatasetBoundVariantContextRDD private[rdd] ( +case class DatasetBoundVariantContextDataset private[rdd] ( dataset: Dataset[VariantContextProduct], sequences: SequenceDictionary, @transient samples: Seq[Sample], @transient headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines, override val isPartitioned: Boolean = true, override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends VariantContextRDD - with DatasetBoundGenomicDataset[VariantContext, VariantContextProduct, VariantContextRDD] { + override val optLookbackPartitions: Option[Int] = Some(1)) extends VariantContextDataset + with DatasetBoundGenomicDataset[VariantContext, VariantContextProduct, VariantContextDataset] { protected lazy val optPartitionMap = None lazy val rdd = dataset.rdd.map(_.toModel) override def transformDataset( - tFn: Dataset[VariantContextProduct] => Dataset[VariantContextProduct]): VariantContextRDD = { + tFn: Dataset[VariantContextProduct] => Dataset[VariantContextProduct]): VariantContextDataset = { copy(dataset = tFn(dataset)) } def replaceSequences( - newSequences: SequenceDictionary): VariantContextRDD = { + newSequences: SequenceDictionary): VariantContextDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantContextRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantContextDataset = { copy(headerLines = newHeaderLines) } - def replaceSamples(newSamples: Iterable[Sample]): VariantContextRDD = { + def replaceSamples(newSamples: Iterable[Sample]): VariantContextDataset = { copy(samples = newSamples.toSeq) } } -case class RDDBoundVariantContextRDD private[rdd] ( +case class RDDBoundVariantContextDataset private[rdd] ( rdd: RDD[VariantContext], sequences: SequenceDictionary, @transient samples: Seq[Sample], @transient headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines, - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None) extends VariantContextRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None) extends VariantContextDataset { /** * A SQL Dataset of variant contexts. 
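Note: the conversion and filtering API on the variant side is untouched apart from the renamed return types. A small sketch under that assumption (the 30.0 genotype-quality threshold is illustrative; the class and method names are taken from this patch):

import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantContextDataset, VariantDataset }

// View a set of called genotypes both as bare variants and as variant contexts,
// after dropping low-quality calls.
def variantViews(genotypes: GenotypeDataset): (VariantDataset, VariantContextDataset) = {
  val filtered: GenotypeDataset = genotypes.filterByQuality(30.0)
  (filtered.toVariants(), filtered.toVariantContexts())
}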
@@ -173,23 +173,23 @@ case class RDDBoundVariantContextRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): VariantContextRDD = { + newSequences: SequenceDictionary): VariantContextDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantContextRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantContextDataset = { copy(headerLines = newHeaderLines) } - def replaceSamples(newSamples: Iterable[Sample]): VariantContextRDD = { + def replaceSamples(newSamples: Iterable[Sample]): VariantContextDataset = { copy(samples = newSamples.toSeq) } } /** - * An RDD containing VariantContexts attached to a reference and samples. + * A genomic dataset containing VariantContexts attached to a reference and samples. */ -sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[VariantContext, VariantContextProduct, VariantContextRDD] with GenomicDataset[VariantContext, VariantContextProduct, VariantContextRDD] with Logging with VCFSupportingGenomicDataset[VariantContext, VariantContextProduct, VariantContextRDD] { +sealed abstract class VariantContextDataset extends MultisampleGenomicDataset[VariantContext, VariantContextProduct, VariantContextDataset] with GenomicDataset[VariantContext, VariantContextProduct, VariantContextDataset] with Logging with VCFSupportingGenomicDataset[VariantContext, VariantContextProduct, VariantContextDataset] { protected val productFn = VariantContextProduct.fromModel(_) protected val unproductFn = (vc: VariantContextProduct) => vc.toModel @@ -228,37 +228,37 @@ sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[Varian } def transformDataset( - tFn: Dataset[VariantContextProduct] => Dataset[VariantContextProduct]): VariantContextRDD = { - DatasetBoundVariantContextRDD(tFn(dataset), sequences, samples, headerLines) + tFn: Dataset[VariantContextProduct] => Dataset[VariantContextProduct]): VariantContextDataset = { + DatasetBoundVariantContextDataset(tFn(dataset), sequences, samples, headerLines) } /** - * Replaces the header lines attached to this RDD. + * Replaces the header lines attached to this genomic dataset. * - * @param newHeaderLines The new header lines to attach to this RDD. - * @return A new RDD with the header lines replaced. + * @param newHeaderLines The new header lines to attach to this genomic dataset. + * @return A new genomic dataset with the header lines replaced.
*/ - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantContextRDD + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantContextDataset protected def buildTree(rdd: RDD[(ReferenceRegion, VariantContext)])( implicit tTag: ClassTag[VariantContext]): IntervalArray[ReferenceRegion, VariantContext] = { IntervalArray(rdd, VariantContextArray.apply(_, _)) } - def union(rdds: VariantContextRDD*): VariantContextRDD = { - val iterableRdds = rdds.toSeq - VariantContextRDD( - rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), - iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), - (samples ++ iterableRdds.flatMap(_.samples)).distinct, - (headerLines ++ iterableRdds.flatMap(_.headerLines)).distinct) + def union(datasets: VariantContextDataset*): VariantContextDataset = { + val iterableDatasets = datasets.toSeq + VariantContextDataset( + rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _), + (samples ++ iterableDatasets.flatMap(_.samples)).distinct, + (headerLines ++ iterableDatasets.flatMap(_.headerLines)).distinct) } /** - * @return Returns a GenotypeRDD containing the Genotypes in this RDD. + * @return Returns a GenotypeDataset containing the Genotypes in this genomic dataset. */ - def toGenotypes(): GenotypeRDD = { - new RDDBoundGenotypeRDD(rdd.flatMap(_.genotypes), + def toGenotypes(): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd.flatMap(_.genotypes), sequences, samples, headerLines, @@ -266,10 +266,10 @@ sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[Varian } /** - * @return Returns the Variants in this RDD. + * @return Returns the Variants in this genomic dataset. */ - def toVariants(): VariantRDD = { - new RDDBoundVariantRDD(rdd.map(_.variant.variant), + def toVariants(): VariantDataset = { + new RDDBoundVariantDataset(rdd.map(_.variant.variant), sequences, headerLines.filter(hl => hl match { case fl: VCFFormatHeaderLine => false @@ -279,7 +279,7 @@ sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[Varian } /** - * Converts an RDD of ADAM VariantContexts to HTSJDK VariantContexts + * Converts a genomic dataset of ADAM VariantContexts to HTSJDK VariantContexts * and saves to disk as VCF. * * File paths that end in .gz or .bgz will be saved as block GZIP compressed VCFs. @@ -299,7 +299,7 @@ sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[Varian } /** - * Converts an RDD of ADAM VariantContexts to HTSJDK VariantContexts + * Converts a genomic dataset of ADAM VariantContexts to HTSJDK VariantContexts * and saves as a single file to disk as VCF. Uses lenient validation * stringency. * @@ -317,7 +317,7 @@ sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[Varian } /** - * Converts an RDD of ADAM VariantContexts to HTSJDK VariantContexts + * Converts a genomic dataset of ADAM VariantContexts to HTSJDK VariantContexts * and saves to disk as VCF. * * File paths that end in .gz or .bgz will be saved as block GZIP compressed VCFs. @@ -443,12 +443,12 @@ sealed abstract class VariantContextRDD extends MultisampleGenomicDataset[Varian /** * @param newRdd The RDD of VariantContexts to replace the underlying RDD. - * @return Returns a new VariantContextRDD where the underlying RDD has + * @return Returns a new VariantContextDataset where the underlying RDD has * been replaced.
*/ protected def replaceRdd(newRdd: RDD[VariantContext], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): VariantContextRDD = { - RDDBoundVariantContextRDD(newRdd, + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): VariantContextDataset = { + RDDBoundVariantContextDataset(newRdd, sequences, samples, headerLines, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantDataset.scala similarity index 75% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantDataset.scala index 24a56bdaec..22ee6b904a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variant/VariantDataset.scala @@ -74,10 +74,10 @@ private[adam] class VariantArraySerializer extends IntervalArraySerializer[Refer } } -object VariantRDD extends Serializable { +object VariantDataset extends Serializable { /** - * Builds a VariantRDD without a partition map. + * Builds a VariantDataset without a partition map. * * @param rdd The underlying Variant RDD. * @param sequences The sequence dictionary for the RDD. @@ -86,9 +86,9 @@ object VariantRDD extends Serializable { */ def apply(rdd: RDD[Variant], sequences: SequenceDictionary, - headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): VariantRDD = { + headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): VariantDataset = { - new RDDBoundVariantRDD(rdd, sequences, headerLines, None) + new RDDBoundVariantDataset(rdd, sequences, headerLines, None) } /** @@ -101,16 +101,16 @@ object VariantRDD extends Serializable { */ def apply(ds: Dataset[VariantProduct], sequences: SequenceDictionary, - headerLines: Seq[VCFHeaderLine]): VariantRDD = { - new DatasetBoundVariantRDD(ds, sequences, headerLines) + headerLines: Seq[VCFHeaderLine]): VariantDataset = { + new DatasetBoundVariantDataset(ds, sequences, headerLines) } } -case class ParquetUnboundVariantRDD private[rdd] ( +case class ParquetUnboundVariantDataset private[rdd] ( @transient private val sc: SparkContext, private val parquetFilename: String, sequences: SequenceDictionary, - @transient headerLines: Seq[VCFHeaderLine]) extends VariantRDD { + @transient headerLines: Seq[VCFHeaderLine]) extends VariantDataset { lazy val rdd: RDD[Variant] = { sc.loadParquet(parquetFilename) @@ -125,23 +125,23 @@ case class ParquetUnboundVariantRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): VariantRDD = { + newSequences: SequenceDictionary): VariantDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantDataset = { copy(headerLines = newHeaderLines) } } -case class DatasetBoundVariantRDD private[rdd] ( +case class DatasetBoundVariantDataset private[rdd] ( dataset: Dataset[VariantProduct], sequences: SequenceDictionary, @transient headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines, override val isPartitioned: Boolean = true, override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends VariantRDD - with DatasetBoundGenomicDataset[Variant, VariantProduct, VariantRDD] { + override val optLookbackPartitions: Option[Int] = 
Some(1)) extends VariantDataset + with DatasetBoundGenomicDataset[Variant, VariantProduct, VariantDataset] { protected lazy val optPartitionMap = None @@ -162,65 +162,65 @@ case class DatasetBoundVariantRDD private[rdd] ( } override def transformDataset( - tFn: Dataset[VariantProduct] => Dataset[VariantProduct]): VariantRDD = { + tFn: Dataset[VariantProduct] => Dataset[VariantProduct]): VariantDataset = { copy(dataset = tFn(dataset)) } def replaceSequences( - newSequences: SequenceDictionary): VariantRDD = { + newSequences: SequenceDictionary): VariantDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantDataset = { copy(headerLines = newHeaderLines) } - override def filterToFiltersPassed(): VariantRDD = { + override def filterToFiltersPassed(): VariantDataset = { transformDataset(dataset => dataset.filter(dataset.col("filtersPassed"))) } - override def filterByQuality(minimumQuality: Double): VariantRDD = { + override def filterByQuality(minimumQuality: Double): VariantDataset = { transformDataset(dataset => dataset.filter(!dataset.col("splitFromMultiAllelic") && dataset.col("quality") >= minimumQuality)) } - override def filterByReadDepth(minimumReadDepth: Int): VariantRDD = { + override def filterByReadDepth(minimumReadDepth: Int): VariantDataset = { transformDataset(dataset => dataset.filter(dataset.col("annotation.readDepth") >= minimumReadDepth)) } - override def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): VariantRDD = { + override def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): VariantDataset = { transformDataset(dataset => dataset.filter(dataset.col("annotation.referenceReadDepth") >= minimumReferenceReadDepth)) } - override def filterSingleNucleotideVariants(): VariantRDD = { + override def filterSingleNucleotideVariants(): VariantDataset = { transformDataset(dataset => dataset.filter("LENGTH(referenceAllele) > 1 OR LENGTH(alternateAllele) > 1")) } - override def filterMultipleNucleotideVariants(): VariantRDD = { + override def filterMultipleNucleotideVariants(): VariantDataset = { transformDataset(dataset => dataset.filter("(LENGTH(referenceAllele) == 1 AND LENGTH(alternateAllele) == 1) OR LENGTH(referenceAllele) != LENGTH(alternateAllele)")) } - override def filterIndels(): VariantRDD = { + override def filterIndels(): VariantDataset = { transformDataset(dataset => dataset.filter("LENGTH(referenceAllele) == LENGTH(alternateAllele)")) } - override def filterToSingleNucleotideVariants(): VariantRDD = { + override def filterToSingleNucleotideVariants(): VariantDataset = { transformDataset(dataset => dataset.filter("LENGTH(referenceAllele) == 1 AND LENGTH(alternateAllele) == 1")) } - override def filterToMultipleNucleotideVariants(): VariantRDD = { + override def filterToMultipleNucleotideVariants(): VariantDataset = { transformDataset(dataset => dataset.filter("(LENGTH(referenceAllele) > 1 OR LENGTH(alternateAllele) > 1) AND LENGTH(referenceAllele) == LENGTH(alternateAllele)")) } - override def filterToIndels(): VariantRDD = { + override def filterToIndels(): VariantDataset = { transformDataset(dataset => dataset.filter("LENGTH(referenceAllele) != LENGTH(alternateAllele)")) } } -case class RDDBoundVariantRDD private[rdd] ( +case class RDDBoundVariantDataset private[rdd] ( rdd: RDD[Variant], sequences: SequenceDictionary, @transient headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines, - 
optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None) extends VariantRDD { + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None) extends VariantDataset { /** * A SQL Dataset of reads. @@ -232,16 +232,16 @@ case class RDDBoundVariantRDD private[rdd] ( } def replaceSequences( - newSequences: SequenceDictionary): VariantRDD = { + newSequences: SequenceDictionary): VariantDataset = { copy(sequences = newSequences) } - def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantRDD = { + def replaceHeaderLines(newHeaderLines: Seq[VCFHeaderLine]): VariantDataset = { copy(headerLines = newHeaderLines) } } -sealed abstract class VariantRDD extends AvroGenomicDataset[Variant, VariantProduct, VariantRDD] with VCFSupportingGenomicDataset[Variant, VariantProduct, VariantRDD] { +sealed abstract class VariantDataset extends AvroGenomicDataset[Variant, VariantProduct, VariantDataset] with VCFSupportingGenomicDataset[Variant, VariantProduct, VariantDataset] { protected val productFn = VariantProduct.fromAvro(_) protected val unproductFn = (v: VariantProduct) => v.toAvro @@ -273,9 +273,9 @@ sealed abstract class VariantRDD extends AvroGenomicDataset[Variant, VariantProd IntervalArray(rdd, VariantArray.apply(_, _)) } - def union(rdds: VariantRDD*): VariantRDD = { + def union(rdds: VariantDataset*): VariantDataset = { val iterableRdds = rdds.toSeq - VariantRDD(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), + VariantDataset(rdd.context.union(rdd, iterableRdds.map(_.rdd): _*), iterableRdds.map(_.sequences).fold(sequences)(_ ++ _), (headerLines ++ iterableRdds.flatMap(_.headerLines)).distinct) } @@ -289,15 +289,15 @@ sealed abstract class VariantRDD extends AvroGenomicDataset[Variant, VariantProd * metadata (sequence dictionary, and etc) is copied without modification. */ def transformDataset( - tFn: Dataset[VariantProduct] => Dataset[VariantProduct]): VariantRDD = { - DatasetBoundVariantRDD(tFn(dataset), sequences, headerLines) + tFn: Dataset[VariantProduct] => Dataset[VariantProduct]): VariantDataset = { + DatasetBoundVariantDataset(tFn(dataset), sequences, headerLines) } /** - * @return Returns this VariantRDD as a VariantContextRDD. + * @return Returns this VariantDataset as a VariantContextDataset. */ - def toVariantContexts(): VariantContextRDD = { - new RDDBoundVariantContextRDD(rdd.map(VariantContext(_)), + def toVariantContexts(): VariantContextDataset = { + new RDDBoundVariantContextDataset(rdd.map(VariantContext(_)), sequences, Seq.empty[Sample], headerLines, @@ -305,71 +305,71 @@ sealed abstract class VariantRDD extends AvroGenomicDataset[Variant, VariantProd } /** - * Filter this VariantRDD to filters passed (VCF column 7 "FILTER" value PASS). + * Filter this VariantDataset to filters passed (VCF column 7 "FILTER" value PASS). * - * @return VariantRDD filtered to filters passed. + * @return VariantDataset filtered to filters passed. */ - def filterToFiltersPassed(): VariantRDD = { + def filterToFiltersPassed(): VariantDataset = { transform(rdd => rdd.filter(_.getFiltersPassed)) } /** - * Filter this VariantRDD by quality (VCF column 6 "QUAL"). Variants split + * Filter this VariantDataset by quality (VCF column 6 "QUAL"). Variants split * for multi-allelic sites will also be filtered out. * * @param minimumQuality Minimum quality to filter by, inclusive. - * @return VariantRDD filtered by quality. + * @return VariantDataset filtered by quality. 
*/ - def filterByQuality(minimumQuality: Double): VariantRDD = { + def filterByQuality(minimumQuality: Double): VariantDataset = { transform(rdd => rdd.filter(v => !(Option(v.getSplitFromMultiAllelic).exists(_ == true)) && Option(v.getQuality).exists(_ >= minimumQuality))) } /** - * Filter this VariantRDD by total read depth (VCF INFO reserved key AD, Number=R, + * Filter this VariantDataset by total read depth (VCF INFO reserved key AD, Number=R, * split for multi-allelic sites into single integer values for the reference allele * (filterByReferenceReadDepth) and the alternate allele (this method)). * * @param minimumReadDepth Minimum total read depth to filter by, inclusive. - * @return VariantRDD filtered by total read depth. + * @return VariantDataset filtered by total read depth. */ - def filterByReadDepth(minimumReadDepth: Int): VariantRDD = { + def filterByReadDepth(minimumReadDepth: Int): VariantDataset = { transform(rdd => rdd.filter(v => Option(v.getAnnotation().getReadDepth).exists(_ >= minimumReadDepth))) } /** - * Filter this VariantRDD by reference total read depth (VCF INFO reserved key AD, Number=R, + * Filter this VariantDataset by reference total read depth (VCF INFO reserved key AD, Number=R, * split for multi-allelic sites into single integer values for the alternate allele * (filterByReadDepth) and the reference allele (this method)). * * @param minimumReferenceReadDepth Minimum reference total read depth to filter by, inclusive. - * @return VariantRDD filtered by reference total read depth. + * @return VariantDataset filtered by reference total read depth. */ - def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): VariantRDD = { + def filterByReferenceReadDepth(minimumReferenceReadDepth: Int): VariantDataset = { transform(rdd => rdd.filter(v => Option(v.getAnnotation().getReferenceReadDepth).exists(_ >= minimumReferenceReadDepth))) } /** - * Filter single nucleotide variants (SNPs) from this VariantRDD. + * Filter single nucleotide variants (SNPs) from this VariantDataset. * - * @return VariantRDD filtered to remove single nucleotide variants (SNPs). + * @return VariantDataset filtered to remove single nucleotide variants (SNPs). */ def filterSingleNucleotideVariants() = { transform(rdd => rdd.filter(v => !RichVariant(v).isSingleNucleotideVariant)) } /** - * Filter multiple nucleotide variants (MNPs) from this VariantRDD. + * Filter multiple nucleotide variants (MNPs) from this VariantDataset. * - * @return VariantRDD filtered to remove multiple nucleotide variants (MNPs). + * @return VariantDataset filtered to remove multiple nucleotide variants (MNPs). */ def filterMultipleNucleotideVariants() = { transform(rdd => rdd.filter(v => !RichVariant(v).isMultipleNucleotideVariant)) } /** - * Filter insertions and deletions (indels) from this VariantRDD. + * Filter insertions and deletions (indels) from this VariantDataset. * - * @return VariantRDD filtered to remove insertions and deletions (indels). + * @return VariantDataset filtered to remove insertions and deletions (indels). */ def filterIndels() = { transform(rdd => rdd.filter(v => { @@ -379,27 +379,27 @@ sealed abstract class VariantRDD extends AvroGenomicDataset[Variant, VariantProd } /** - * Filter this VariantRDD to include only single nucleotide variants (SNPs). + * Filter this VariantDataset to include only single nucleotide variants (SNPs). * - * @return VariantRDD filtered to include only single nucleotide variants (SNPs). 
+ * @return VariantDataset filtered to include only single nucleotide variants (SNPs). */ def filterToSingleNucleotideVariants() = { transform(rdd => rdd.filter(v => RichVariant(v).isSingleNucleotideVariant)) } /** - * Filter this VariantRDD to include only multiple nucleotide variants (MNPs). + * Filter this VariantDataset to include only multiple nucleotide variants (MNPs). * - * @return VariantRDD filtered to include only multiple nucleotide variants (MNPs). + * @return VariantDataset filtered to include only multiple nucleotide variants (MNPs). */ def filterToMultipleNucleotideVariants() = { transform(rdd => rdd.filter(v => RichVariant(v).isMultipleNucleotideVariant)) } /** - * Filter this VariantRDD to include only insertions and deletions (indels). + * Filter this VariantDataset to include only insertions and deletions (indels). * - * @return VariantRDD filtered to include only insertions and deletions (indels). + * @return VariantDataset filtered to include only insertions and deletions (indels). */ def filterToIndels() = { transform(rdd => rdd.filter(v => { @@ -410,11 +410,11 @@ sealed abstract class VariantRDD extends AvroGenomicDataset[Variant, VariantProd /** * @param newRdd An RDD to replace the underlying RDD with. - * @return Returns a new VariantRDD with the underlying RDD replaced. + * @return Returns a new VariantDataset with the underlying RDD replaced. */ protected def replaceRdd(newRdd: RDD[Variant], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): VariantRDD = { - RDDBoundVariantRDD(newRdd, sequences, headerLines, newPartitionMap) + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): VariantDataset = { + RDDBoundVariantDataset(newRdd, sequences, headerLines, newPartitionMap) } /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/ADAMShell.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/ADAMShell.scala index 401c7f1c85..daf2275939 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/ADAMShell.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/ADAMShell.scala @@ -26,12 +26,12 @@ import htsjdk.variant.vcf.{ } import org.apache.spark.SparkContext import org.bdgenomics.adam.models.VariantContext -import org.bdgenomics.adam.rdd.feature.FeatureRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.feature.FeatureDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD + GenotypeDataset, + VariantDataset, + VariantContextDataset } import org.bdgenomics.formats.avro.{ AlignmentRecord, @@ -59,14 +59,14 @@ object ADAMShell { ) /** - * Print attribute values for alignment records in the specified rdd up to the limit. + * Print attribute values for alignment records in the specified AlignmentRecordDataset up to the limit. * - * @param rdd AlignmentRecordRDD. + * @param alignments AlignmentRecordDataset. * @param keys Sequence of attribute keys. * @param limit Number of alignment records to print attribute values for. Defaults to 10. 
*/ - def printAlignmentAttributes(rdd: AlignmentRecordRDD, keys: Seq[String], limit: Int = 10): Unit = { - printAlignmentAttributes(rdd.rdd.take(limit), keys) + def printAlignmentAttributes(alignments: AlignmentRecordDataset, keys: Seq[String], limit: Int = 10): Unit = { + printAlignmentAttributes(alignments.rdd.take(limit), keys) } private def findMatchingAttribute(key: String, attributes: String): String = { @@ -107,14 +107,14 @@ object ADAMShell { ) /** - * Print attribute values for features in the specified rdd up to the limit. + * Print attribute values for features in the specified FeatureDataset up to the limit. * - * @param rdd FeatureRDD. + * @param features FeatureDataset. * @param keys Sequence of attribute keys. * @param limit Number of features to print attribute values for. Defaults to 10. */ - def printFeatureAttributes(rdd: FeatureRDD, keys: Seq[String], limit: Int = 10): Unit = { - printFeatureAttributes(rdd.rdd.take(limit), keys) + def printFeatureAttributes(features: FeatureDataset, keys: Seq[String], limit: Int = 10): Unit = { + printFeatureAttributes(features.rdd.take(limit), keys) } /** @@ -141,14 +141,14 @@ object ADAMShell { } /** - * Print VCF FORMAT field attributes for genotypes in the specified rdd up to the limit. + * Print VCF FORMAT field attributes for genotypes in the specified GenotypeDataset up to the limit. * - * @param rdd GenotypeRDD. + * @param genotypes GenotypeDataset. * @param keys Sequence of VCF FORMAT field attribute keys. * @param limit Number of genotypes to print VCF FORMAT field attribute values for. Defaults to 10. */ - def printFormatFields(rdd: GenotypeRDD, keys: Seq[String], limit: Int = 10): Unit = { - printFormatFields(rdd.rdd.take(limit), keys, rdd.headerLines) + def printFormatFields(genotypes: GenotypeDataset, keys: Seq[String], limit: Int = 10): Unit = { + printFormatFields(genotypes.rdd.take(limit), keys, genotypes.headerLines) } /** Genotype headers. */ @@ -189,19 +189,19 @@ object ADAMShell { } /** - * Print genotype filter values for genotypes in the specified rdd up to the limit. + * Print genotype filter values for genotypes in the specified GenotypeDataset up to the limit. * - * @param rdd GenotypeRDD. + * @param genotypes GenotypeDataset. * @param limit Number of genotypes to print genotype filter values for. Defaults to 10. */ - def printGenotypeFilters(rdd: GenotypeRDD, limit: Int = 10): Unit = { - printGenotypeFilters(rdd.rdd.take(limit), rdd.headerLines) + def printGenotypeFilters(genotypes: GenotypeDataset, limit: Int = 10): Unit = { + printGenotypeFilters(genotypes.rdd.take(limit), genotypes.headerLines) } /** * Print genotype filter values for the specified genotypes. * - * @param rdd GenotypeRDD. + * @param genotypes Sequence of genotypes. * @param headerLines Sequence of VCF header lines. */ def printGenotypeFilters(genotypes: Seq[Genotype], headerLines: Seq[VCFHeaderLine]): Unit = { @@ -231,9 +231,9 @@ object ADAMShell { } /** - * Print attribute values for the specified features. + * Print attribute values for the specified samples. * - * @param alignments Sequence of features. + * @param samples Sequence of samples. * @param keys Sequence of attribute keys. */ def printSampleAttributes(samples: Seq[Sample], keys: Seq[String]): Unit = { @@ -251,13 +251,13 @@ object ADAMShell { } /** - * Print filter values for variants in the specified rdd up to the limit. + * Print filter values for variants in the specified VariantDataset up to the limit. * - * @param rdd VariantRDD. + * @param variants VariantDataset. 
* @param limit Number of variants to print filter values for. Defaults to 10. */ - def printVariantFilters(rdd: VariantRDD, limit: Int = 10): Unit = { - printVariantFilters(rdd.rdd.take(limit), rdd.headerLines) + def printVariantFilters(variants: VariantDataset, limit: Int = 10): Unit = { + printVariantFilters(variants.rdd.take(limit), variants.headerLines) } /** Variant headers. */ @@ -300,14 +300,14 @@ object ADAMShell { } /** - * Print VCF INFO field attributes for variants in the specified rdd up to the limit. + * Print VCF INFO field attributes for variants in the specified VariantDataset up to the limit. * - * @param rdd VariantRDD. + * @param variants VariantDataset. * @param keys Sequence of VCF INFO field attribute keys. * @param limit Number of variants to print VCF INFO field attribute values for. Defaults to 10. */ - def printInfoFields(rdd: VariantRDD, keys: Seq[String], limit: Int = 10): Unit = { - printInfoFields(rdd.rdd.take(limit), keys, rdd.headerLines) + def printInfoFields(variants: VariantDataset, keys: Seq[String], limit: Int = 10): Unit = { + printInfoFields(variants.rdd.take(limit), keys, variants.headerLines) } /** diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastqRecordConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastqRecordConverterSuite.scala index 71f438c8f7..7f706e6721 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastqRecordConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastqRecordConverterSuite.scala @@ -45,7 +45,7 @@ class FastqRecordConverterSuite extends FunSuite with PrivateMethodTester { ("@desc", false) ) ) // not exception raised - assert(converter.readNameSuffixAndIndexOfPairMustMatch(readName, isFirstOfPair) === ()) + converter.readNameSuffixAndIndexOfPairMustMatch(readName, isFirstOfPair) } test("test parseReadInFastq, read suffix removal") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/SnpTableSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/SnpTableSuite.scala index 5ef4694723..edb14accd2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/models/SnpTableSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/SnpTableSuite.scala @@ -18,7 +18,7 @@ package org.bdgenomics.adam.models import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro.Variant @@ -65,7 +65,7 @@ class SnpTableSuite extends ADAMFunSuite { }) } - def lookUpVariants(rdd: VariantRDD): SnpTable = { + def lookUpVariants(rdd: VariantDataset): SnpTable = { val table = SnpTable(rdd) val variants = rdd.rdd.collect diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index 93a7320f69..c1022c22e3 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -390,10 +390,10 @@ class ADAMContextSuite extends ADAMFunSuite { sparkTest("read a HLA fasta from GRCh38") { val inputPath = testFile("HLA_DQB1_05_01_01_02.fa") - val gRdd = sc.loadFasta(inputPath, 10000L) - assert(gRdd.sequences.records.size === 1) - assert(gRdd.sequences.records.head.name === "HLA-DQB1*05:01:01:02") - val fragments = gRdd.rdd.collect + val gDataset = 
sc.loadFasta(inputPath, 10000L) + assert(gDataset.sequences.records.size === 1) + assert(gDataset.sequences.records.head.name === "HLA-DQB1*05:01:01:02") + val fragments = gDataset.rdd.collect assert(fragments.size === 1) assert(fragments.head.getContigName === "HLA-DQB1*05:01:01:02") } @@ -725,11 +725,11 @@ class ADAMContextSuite extends ADAMFunSuite { val sd = sc.loadSequenceDictionary(sdPath) val path = testFile("gencode.v7.annotation.trunc10.bed") // uses "chr1" - val featureRdd = sc.sc.loadFeatures(path, optSequenceDictionary = Some(sd)) - val features: RDD[Feature] = featureRdd.rdd + val featureDs = sc.sc.loadFeatures(path, optSequenceDictionary = Some(sd)) + val features: RDD[Feature] = featureDs.rdd assert(features.count === 10) - val sequences = featureRdd.sequences + val sequences = featureDs.sequences assert(sequences.records.size === 93) assert(sequences("chr1").isDefined) assert(sequences("chr1").get.length === 249250621L) @@ -742,11 +742,11 @@ class ADAMContextSuite extends ADAMFunSuite { val sd = sc.loadSequenceDictionary(sdPath) val path = testFile("dvl1.200.bed") // uses "1" - val featureRdd = sc.sc.loadFeatures(path, optSequenceDictionary = Some(sd)) - val features: RDD[Feature] = featureRdd.rdd + val featureDs = sc.sc.loadFeatures(path, optSequenceDictionary = Some(sd)) + val features: RDD[Feature] = featureDs.rdd assert(features.count === 197) - val sequences = featureRdd.sequences + val sequences = featureDs.sequences assert(sequences.records.size === 93) assert(sequences("chr1").isDefined) assert(sequences("chr1").get.length === 249250621L) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala index 3d44236aea..17e4c4f5f6 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/GenomicPositionPartitionerSuite.scala @@ -112,10 +112,10 @@ class GenomicPositionPartitionerSuite extends ADAMFunSuite { import org.bdgenomics.adam.projections.AlignmentRecordField._ Projection(contigName, start, readName, readMapped) } - val gRdd = sc.loadAlignments(filename, optProjection = Some(p)) - val rdd = gRdd.rdd + val gDataset = sc.loadAlignments(filename, optProjection = Some(p)) + val rdd = gDataset.rdd - val parter = GenomicPositionPartitioner(parts, gRdd.sequences) + val parter = GenomicPositionPartitioner(parts, gDataset.sequences) assert(rdd.count() === 200) @@ -140,10 +140,10 @@ class GenomicPositionPartitionerSuite extends ADAMFunSuite { val filename = testFile("reads12.sam") val parts = 10 - val gRdd = sc.loadAlignments(filename) - val rdd = gRdd.rdd + val gDataset = sc.loadAlignments(filename) + val rdd = gDataset.rdd - val parter = GenomicPositionPartitioner(parts, gRdd.sequences) + val parter = GenomicPositionPartitioner(parts, gDataset.sequences) val p = { import org.bdgenomics.adam.projections.AlignmentRecordField._ diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/SortedGenomicRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/SortedGenomicDatasetSuite.scala similarity index 93% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/SortedGenomicRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/SortedGenomicDatasetSuite.scala index 123e5d8d80..952d3d1c42 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/SortedGenomicRDDSuite.scala +++ 
b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/SortedGenomicDatasetSuite.scala @@ -20,14 +20,14 @@ package org.bdgenomics.adam.rdd import org.bdgenomics.adam.converters.DefaultHeaderLines import org.bdgenomics.adam.models.{ SequenceRecord, SequenceDictionary, ReferenceRegion } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD -import org.bdgenomics.adam.rdd.feature.FeatureRDD -import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.feature.FeatureDataset +import org.bdgenomics.adam.rdd.variant.GenotypeDataset import org.bdgenomics.formats.avro._ import org.bdgenomics.utils.misc.SparkFunSuite import scala.collection.mutable.ListBuffer -class SortedGenomicRDDSuite extends SparkFunSuite { +class SortedGenomicDatasetSuite extends SparkFunSuite { /** * Determines if a given partition map has been correctly sorted @@ -110,9 +110,9 @@ class SortedGenomicRDDSuite extends SparkFunSuite { // testing the left side with an extremely large region that is // not the last record on a partition // this test also tests the case that our - val genotypeRddBuilder = new ListBuffer[Genotype]() + val genotypeBuilder = new ListBuffer[Genotype]() - genotypeRddBuilder += { + genotypeBuilder += { Genotype.newBuilder() .setContigName("chr1") .setStart(2L) @@ -129,7 +129,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - genotypeRddBuilder += { + genotypeBuilder += { Genotype.newBuilder() .setContigName("chr1") .setStart(3L) @@ -146,7 +146,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - genotypeRddBuilder += { + genotypeBuilder += { Genotype.newBuilder() .setContigName("chr1") .setStart(6L) @@ -163,7 +163,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - genotypeRddBuilder += { + genotypeBuilder += { Genotype.newBuilder() .setContigName("chr1") .setStart(8L) @@ -180,9 +180,9 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - val featureRddBuilder = new ListBuffer[Feature]() + val featureBuilder = new ListBuffer[Feature]() - featureRddBuilder += { + featureBuilder += { Feature.newBuilder() .setContigName("chr1") .setStart(61L) @@ -190,7 +190,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - featureRddBuilder += { + featureBuilder += { Feature.newBuilder() .setContigName("chr1") .setStart(11L) @@ -198,7 +198,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - featureRddBuilder += { + featureBuilder += { Feature.newBuilder() .setContigName("chr1") .setStart(3L) @@ -206,7 +206,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - featureRddBuilder += { + featureBuilder += { Feature.newBuilder() .setContigName("chr1") .setStart(6L) @@ -214,7 +214,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - featureRddBuilder += { + featureBuilder += { Feature.newBuilder() .setContigName("chr1") .setStart(50L) @@ -222,7 +222,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { .build() } - featureRddBuilder += { + featureBuilder += { Feature.newBuilder() .setContigName("chr1") .setStart(1L) @@ -231,13 +231,13 @@ class SortedGenomicRDDSuite extends SparkFunSuite { } val genotypes = - GenotypeRDD(sc.parallelize(genotypeRddBuilder), + GenotypeDataset(sc.parallelize(genotypeBuilder), sd, Seq(), DefaultHeaderLines.allHeaderLines) .sortLexicographically(storePartitionMap = true, partitions = 2) genotypes.rdd.mapPartitionsWithIndex((idx, iter) => { 
iter.map(f => (idx, f)) }).collect - val features = FeatureRDD(sc.parallelize(featureRddBuilder), sd, Seq.empty) + val features = FeatureDataset(sc.parallelize(featureBuilder), sd, Seq.empty) val x = features.copartitionByReferenceRegion(genotypes) val z = x.rdd.mapPartitionsWithIndex((idx, iter) => { if (idx == 0 && iter.size != 6) { @@ -263,7 +263,7 @@ class SortedGenomicRDDSuite extends SparkFunSuite { x.sortLexicographically(storePartitionMap = true, partitions = 1600) // perform join using 1600 partitions - // 1600 is much more than the amount of data in the GenomicRDD + // 1600 is much more than the amount of data in the GenomicDataset // so we also test our ability to handle this extreme request val b = z.shuffleRegionJoin(x, Some(1600), 0L) val c = x.shuffleRegionJoin(z, Some(1600), 0L) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala similarity index 83% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala index 75dd2ed8f9..bd066062f6 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala @@ -24,13 +24,13 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ Dataset, SQLContext } import org.bdgenomics.adam.models._ import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD + GenotypeDataset, + VariantDataset, + VariantContextDataset } import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, @@ -45,7 +45,7 @@ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ import scala.collection.mutable.ListBuffer -object NucleotideContigFragmentRDDSuite extends Serializable { +object NucleotideContigFragmentDatasetSuite extends Serializable { def covFn(ncf: NucleotideContigFragment): Coverage = { Coverage(ncf.getContigName, @@ -101,9 +101,9 @@ object NucleotideContigFragmentRDDSuite extends Serializable { } } -class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { +class NucleotideContigFragmentDatasetSuite extends ADAMFunSuite { - sparkTest("union two ncf rdds together") { + sparkTest("union two ncf genomic datasets together") { val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 10000L) val fragments2 = sc.loadFasta(testFile("artificial.fa")) val union = fragments1.union(fragments2) @@ -112,7 +112,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { } sparkTest("round trip a ncf to parquet") { - def testMetadata(fRdd: NucleotideContigFragmentRDD) { + def testMetadata(fRdd: NucleotideContigFragmentDataset) { val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) } @@ -143,7 +143,7 @@ class 
NucleotideContigFragmentRDDSuite extends ADAMFunSuite { } sparkTest("round trip a ncf to partitioned parquet") { - def testMetadata(fRdd: NucleotideContigFragmentRDD) { + def testMetadata(fRdd: NucleotideContigFragmentDataset) { val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) } @@ -195,7 +195,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setContigLength(900L) .build() - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(ctg0, ctg1))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(ctg0, ctg1))) assert(rdd.sequences.containsReferenceName("chr0")) val chr0 = rdd.sequences("chr0").get @@ -219,7 +219,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .build() val region = ReferenceRegion(fragment).get - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) assert(rdd.extract(region) === "ACTGTAC") } @@ -238,7 +238,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .build() val region = new ReferenceRegion("chr1", 1L, 6L) - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) assert(rdd.extract(region) === "CTGTA") } @@ -279,7 +279,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val region0 = ReferenceRegion(fragment0).get val region1 = ReferenceRegion(fragment1).get.merge(ReferenceRegion(fragment2).get) - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment0, + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, fragment1, fragment2))) @@ -317,7 +317,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val dic = new SequenceDictionary(Vector(record)) val frags = sc.parallelize(dnas2fragments(dnas)) - val fragments = NucleotideContigFragmentRDD(frags, dic) + val fragments = NucleotideContigFragmentDataset(frags, dic) val byRegion = fragments.rdd.keyBy(ReferenceRegion(_)) @@ -369,7 +369,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val region0 = new ReferenceRegion("chr1", 1L, 6L) val region1 = new ReferenceRegion("chr2", 3L, 9L) - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment0, + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, fragment1, fragment2))) @@ -392,7 +392,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { fragments += frag } var passed = true - val rdd = NucleotideContigFragmentRDD(sc.parallelize(fragments.toList)) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(fragments.toList)) try { val result = rdd.extract(new ReferenceRegion("chr1", 0L, 1000L)) } catch { @@ -411,7 +411,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(1) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) val outputDir = Files.createTempDir() val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" @@ -434,7 +434,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(1) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) val outputDir = Files.createTempDir() val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" @@ -458,7 +458,7 @@ class 
NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(null) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) val outputDir = Files.createTempDir() val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" @@ -482,7 +482,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(1) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) val outputDir = Files.createTempDir() val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" @@ -506,7 +506,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(null) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) def validate(fileName: String) { val fastaLines = scala.io.Source.fromFile(new File(fileName + "/part-00000")).getLines().toSeq @@ -549,7 +549,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(3) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment0, fragment1, fragment2))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, fragment1, fragment2))) val outputDir = Files.createTempDir() val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" @@ -592,7 +592,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(3) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment0, + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, fragment1, fragment2))) @@ -622,7 +622,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(null) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) val merged = rdd.mergeFragments() assert(merged.rdd.count == 1L) @@ -641,7 +641,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(1) .build - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment))) + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) val merged = rdd.mergeFragments() assert(merged.rdd.count == 1L) @@ -682,7 +682,7 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { .setFragments(2) .build() - val rdd = NucleotideContigFragmentRDD(sc.parallelize(List(fragment2, + val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment2, fragment1, fragment0))) val merged = rdd.mergeFragments() @@ -711,19 +711,19 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { assert(sequences.rdd.count() === 4) } - sparkTest("transform contigs to coverage rdd") { + sparkTest("transform contigs to coverage genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 8) } - val coverage = contigs.transmute[Coverage, Coverage, CoverageRDD]( + val coverage = contigs.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.covFn) + rdd.map(NucleotideContigFragmentDatasetSuite.covFn) }) checkSave(coverage) @@ -731,27 +731,27 @@ class 
NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val coverageDs: CoverageRDD = contigs.transmuteDataset[Coverage, Coverage, CoverageRDD]( + val coverageDs: CoverageDataset = contigs.transmuteDataset[Coverage, Coverage, CoverageDataset]( (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => NucleotideContigFragmentRDDSuite.covFn(r.toAvro)) + ds.map(r => NucleotideContigFragmentDatasetSuite.covFn(r.toAvro)) }) checkSave(coverageDs) } - sparkTest("transform contigs to feature rdd") { + sparkTest("transform contigs to feature genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.saveAsBed(tempPath) assert(sc.loadFeatures(tempPath).rdd.count === 8) } - val features: FeatureRDD = contigs.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = contigs.transmute[Feature, FeatureProduct, FeatureDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.featFn) + rdd.map(NucleotideContigFragmentDatasetSuite.featFn) }) checkSave(features) @@ -759,30 +759,30 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: FeatureRDD = contigs.transmuteDataset[Feature, FeatureProduct, FeatureRDD]( + val featuresDs: FeatureDataset = contigs.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( (ds: Dataset[NucleotideContigFragmentProduct]) => { ds.map(r => { FeatureProduct.fromAvro( - NucleotideContigFragmentRDDSuite.featFn(r.toAvro)) + NucleotideContigFragmentDatasetSuite.featFn(r.toAvro)) }) }) checkSave(featuresDs) } - sparkTest("transform contigs to fragment rdd") { + sparkTest("transform contigs to fragment genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 8) } - val fragments: FragmentRDD = contigs.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = contigs.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.fragFn) + rdd.map(NucleotideContigFragmentDatasetSuite.fragFn) }) checkSave(fragments) @@ -790,30 +790,30 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val fragmentsDs: FragmentRDD = contigs.transmuteDataset[Fragment, FragmentProduct, FragmentRDD]( + val fragmentsDs: FragmentDataset = contigs.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( (ds: Dataset[NucleotideContigFragmentProduct]) => { ds.map(r => { FragmentProduct.fromAvro( - NucleotideContigFragmentRDDSuite.fragFn(r.toAvro)) + NucleotideContigFragmentDatasetSuite.fragFn(r.toAvro)) }) }) checkSave(fragmentsDs) } - sparkTest("transform contigs to read rdd") { + sparkTest("transform contigs to read genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".adam") reads.saveAsParquet(tempPath) 
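// --- Illustrative sketch (editor's addition, not part of this patch) ---
// The renamed suites exercise two conversion paths on every GenomicDataset:
// transmute maps the underlying RDD, while transmuteDataset maps the Spark SQL
// Dataset of Product rows. A minimal sketch of both, assuming `contigs` is any
// NucleotideContigFragmentDataset (e.g. from sc.loadFasta) and reusing the
// suite's covFn converter; all other val names are example-only.
val sketchSqlContext = SQLContext.getOrCreate(sc)
import sketchSqlContext.implicits._
val coverageViaRdd: CoverageDataset =
  contigs.transmute[Coverage, Coverage, CoverageDataset](
    (rdd: RDD[NucleotideContigFragment]) =>
      rdd.map(NucleotideContigFragmentDatasetSuite.covFn))
val coverageViaDataset: CoverageDataset =
  contigs.transmuteDataset[Coverage, Coverage, CoverageDataset](
    (ds: Dataset[NucleotideContigFragmentProduct]) =>
      ds.map(r => NucleotideContigFragmentDatasetSuite.covFn(r.toAvro)))
// --- end sketch ---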
assert(sc.loadAlignments(tempPath).rdd.count === 8) } - val reads: AlignmentRecordRDD = contigs.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = contigs.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.readFn) + rdd.map(NucleotideContigFragmentDatasetSuite.readFn) }) checkSave(reads) @@ -821,30 +821,30 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val readsDs: AlignmentRecordRDD = contigs.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val readsDs: AlignmentRecordDataset = contigs.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (ds: Dataset[NucleotideContigFragmentProduct]) => { ds.map(r => { AlignmentRecordProduct.fromAvro( - NucleotideContigFragmentRDDSuite.readFn(r.toAvro)) + NucleotideContigFragmentDatasetSuite.readFn(r.toAvro)) }) }) checkSave(readsDs) } - sparkTest("transform contigs to genotype rdd") { + sparkTest("transform contigs to genotype genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) assert(sc.loadGenotypes(tempPath).rdd.count === 8) } - val genotypes: GenotypeRDD = contigs.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = contigs.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.genFn) + rdd.map(NucleotideContigFragmentDatasetSuite.genFn) }) checkSave(genotypes) @@ -852,30 +852,30 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val genotypesDs: GenotypeRDD = contigs.transmuteDataset[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypesDs: GenotypeDataset = contigs.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( (ds: Dataset[NucleotideContigFragmentProduct]) => { ds.map(r => { GenotypeProduct.fromAvro( - NucleotideContigFragmentRDDSuite.genFn(r.toAvro)) + NucleotideContigFragmentDatasetSuite.genFn(r.toAvro)) }) }) checkSave(genotypesDs) } - sparkTest("transform contigs to variant rdd") { + sparkTest("transform contigs to variant genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 8) } - val variants: VariantRDD = contigs.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = contigs.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.varFn) + rdd.map(NucleotideContigFragmentDatasetSuite.varFn) }) checkSave(variants) @@ -883,27 +883,27 @@ class NucleotideContigFragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val variantsDs: VariantRDD = contigs.transmuteDataset[Variant, VariantProduct, VariantRDD]( + val variantsDs: VariantDataset = contigs.transmuteDataset[Variant, VariantProduct, VariantDataset]( (ds: 
Dataset[NucleotideContigFragmentProduct]) => { ds.map(r => { VariantProduct.fromAvro( - NucleotideContigFragmentRDDSuite.varFn(r.toAvro)) + NucleotideContigFragmentDatasetSuite.varFn(r.toAvro)) }) }) checkSave(variantsDs) } - sparkTest("transform contigs to variant context rdd") { + sparkTest("transform contigs to variant context genomic dataset") { val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - def checkSave(variantContexts: VariantContextRDD) { + def checkSave(variantContexts: VariantContextDataset) { assert(variantContexts.rdd.count === 8) } - val variantContexts: VariantContextRDD = contigs.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val variantContexts: VariantContextDataset = contigs.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentRDDSuite.vcFn) + rdd.map(NucleotideContigFragmentDatasetSuite.vcFn) }) checkSave(variantContexts) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala similarity index 74% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala index ef86f56e54..bc2363e06c 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala @@ -27,13 +27,13 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD + GenotypeDataset, + VariantDataset, + VariantContextDataset } import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, @@ -47,7 +47,7 @@ import org.bdgenomics.adam.sql.{ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ -object CoverageRDDSuite extends Serializable { +object CoverageDatasetSuite extends Serializable { def ncfFn(cov: Coverage): NucleotideContigFragment = { NucleotideContigFragment.newBuilder @@ -104,7 +104,7 @@ object CoverageRDDSuite extends Serializable { } } -class CoverageRDDSuite extends ADAMFunSuite { +class CoverageDatasetSuite extends ADAMFunSuite { val sd = new SequenceDictionary(Vector(SequenceRecord("chr1", 2000L))) @@ -122,7 +122,7 @@ class CoverageRDDSuite extends ADAMFunSuite { } sparkTest("correctly saves coverage") { - def testMetadata(cRdd: CoverageRDD) { + def testMetadata(cRdd: CoverageDataset) { val sequenceRdd = cRdd.addSequence(SequenceRecord("aSequence", 1000L)) val sampleRdd = cRdd.addSample(Sample.newBuilder().setName("Sample").build()) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) @@ -133,12 +133,12 @@ class CoverageRDDSuite extends ADAMFunSuite { val f2 = Feature.newBuilder().setContigName("chr1").setStart(15).setEnd(20).setScore(2.0).build() val f3 = Feature.newBuilder().setContigName("chr2").setStart(15).setEnd(20).setScore(2.0).build() - val 
featureRDD: FeatureRDD = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) - val coverageRDD: CoverageRDD = featureRDD.toCoverage - testMetadata(coverageRDD) + val featuresDs: FeatureDataset = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) + val coverageDs: CoverageDataset = featuresDs.toCoverage + testMetadata(coverageDs) val outputFile = tmpLocation(".bed") - coverageRDD.save(outputFile, false, false) + coverageDs.save(outputFile, false, false) val coverage = sc.loadCoverage(outputFile) testMetadata(coverage) @@ -147,7 +147,7 @@ class CoverageRDDSuite extends ADAMFunSuite { // go to dataset and save as parquet val outputFile2 = tmpLocation(".adam") - val dsCov = coverageRDD.transformDataset(ds => ds) + val dsCov = coverage.transformDataset(ds => ds) testMetadata(dsCov) dsCov.save(outputFile2, false, false) val coverage2 = sc.loadCoverage(outputFile2) @@ -169,7 +169,7 @@ class CoverageRDDSuite extends ADAMFunSuite { // go to rdd and save as parquet val outputFile3 = tmpLocation(".adam") - coverageRDD.transform(rdd => rdd).save(outputFile3, false, false) + coverageDs.transform(rdd => rdd).save(outputFile3, false, false) val coverage3 = sc.loadCoverage(outputFile3) assert(coverage3.rdd.count == 3) assert(coverage3.dataset.count == 3) @@ -192,11 +192,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val f2 = Feature.newBuilder().setContigName("chr1").setStart(15).setEnd(20).setScore(2.0).build() val f3 = Feature.newBuilder().setContigName("chr2").setStart(15).setEnd(20).setScore(2.0).build() - val featureRDD: FeatureRDD = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) - val coverageRDD: CoverageRDD = featureRDD.toCoverage + val featureDs: FeatureDataset = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) + val coverageDs: CoverageDataset = featureDs.toCoverage val outputFile = tmpLocation(".adam") - coverageRDD.save(outputFile, false, false) + coverageDs.save(outputFile, false, false) val region = ReferenceRegion("chr1", 1, 9) val predicate = region.toPredicate @@ -215,8 +215,8 @@ class CoverageRDDSuite extends ADAMFunSuite { .setName("Sample2") .build() - val c1 = RDDBoundCoverageRDD(sc.parallelize(cov.toSeq).repartition(1), sd, Seq(sample1), None) - val c2 = RDDBoundCoverageRDD(sc.parallelize(cov.toSeq).repartition(1), sd, Seq(sample2), None) + val c1 = RDDBoundCoverageDataset(sc.parallelize(cov.toSeq).repartition(1), sd, Seq(sample1), None) + val c2 = RDDBoundCoverageDataset(sc.parallelize(cov.toSeq).repartition(1), sd, Seq(sample2), None) val union = c1.union(c2) assert(union.samples.size === 2) @@ -231,11 +231,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val f4 = Feature.newBuilder().setContigName("chr1").setStart(1).setEnd(10).setScore(2.0).setSampleId("S2").build() val f5 = Feature.newBuilder().setContigName("chr1").setStart(15).setEnd(20).setScore(2.0).setSampleId("S2").build() - val featureRDD: FeatureRDD = FeatureRDD(sc.parallelize(Seq(f1, f2, f3, f4, f5))) - val coverageRDD: CoverageRDD = featureRDD.toCoverage + val featureDs: FeatureDataset = FeatureDataset(sc.parallelize(Seq(f1, f2, f3, f4, f5))) + val coverageDs: CoverageDataset = featureDs.toCoverage val outputFile = tmpLocation(".adam") - coverageRDD.save(outputFile, false, false) + coverageDs.save(outputFile, false, false) val region = ReferenceRegion("chr1", 1, 9) val predicate = region.toPredicate @@ -248,9 +248,9 @@ class CoverageRDDSuite extends ADAMFunSuite { val f2 = Feature.newBuilder().setContigName("chr1").setStart(5).setEnd(7).setScore(3.0).build() val f3 = 
Feature.newBuilder().setContigName("chr1").setStart(7).setEnd(20).setScore(4.0).build() - val featureRDD: FeatureRDD = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) - val coverageRDD: CoverageRDD = featureRDD.toCoverage - val coverage = coverageRDD.coverage(bpPerBin = 4) + val featureDs: FeatureDataset = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) + val coverageDs: CoverageDataset = featureDs.toCoverage + val coverage = coverageDs.coverage(bpPerBin = 4) assert(coverage.rdd.count == 4) } @@ -260,10 +260,10 @@ class CoverageRDDSuite extends ADAMFunSuite { val f2 = Feature.newBuilder().setContigName("chr1").setStart(5).setEnd(7).setScore(3.0).build() val f3 = Feature.newBuilder().setContigName("chr1").setStart(7).setEnd(20).setScore(4.0).build() - val featureRDD: FeatureRDD = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) - val coverageRDD: CoverageRDD = featureRDD.toCoverage + val featureDs: FeatureDataset = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) + val coverageDs: CoverageDataset = featureDs.toCoverage - val coverage = coverageRDD + val coverage = coverageDs .aggregatedCoverage(bpPerBin = 4) assert(coverage.rdd.count == 5) @@ -273,7 +273,7 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("collapses coverage records in one partition") { val cov = generateCoverage(20) - val coverage = RDDBoundCoverageRDD(sc.parallelize(cov.toSeq).repartition(1), sd, Seq.empty, None) + val coverage = RDDBoundCoverageDataset(sc.parallelize(cov.toSeq).repartition(1), sd, Seq.empty, None) val collapsed = coverage.collapse assert(coverage.rdd.count == 20) @@ -282,7 +282,7 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("approximately collapses coverage records in multiple partitions") { val cov = generateCoverage(20) - val coverage = RDDBoundCoverageRDD(sc.parallelize(cov), sd, Seq.empty, None) + val coverage = RDDBoundCoverageDataset(sc.parallelize(cov), sd, Seq.empty, None) val collapsed = coverage.collapse assert(collapsed.rdd.count == 8) @@ -291,16 +291,16 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform coverage to contig rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(contigs: NucleotideContigFragmentRDD) { + def checkSave(contigs: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".adam") contigs.saveAsParquet(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count === 3) } - val contigs: NucleotideContigFragmentRDD = coverage.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigs: NucleotideContigFragmentDataset = coverage.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.ncfFn) + rdd.map(CoverageDatasetSuite.ncfFn) }) checkSave(contigs) @@ -308,11 +308,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentRDD = coverage.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigsDs: NucleotideContigFragmentDataset = coverage.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { NucleotideContigFragmentProduct.fromAvro( - CoverageRDDSuite.ncfFn(r)) + CoverageDatasetSuite.ncfFn(r)) }) }) @@ -322,16 +322,16 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform 
coverage to feature rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.saveAsBed(tempPath) assert(sc.loadFeatures(tempPath).rdd.count === 3) } - val features: FeatureRDD = coverage.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = coverage.transmute[Feature, FeatureProduct, FeatureDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.featFn) + rdd.map(CoverageDatasetSuite.featFn) }) checkSave(features) @@ -339,11 +339,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: FeatureRDD = coverage.transmuteDataset[Feature, FeatureProduct, FeatureRDD]( + val featuresDs: FeatureDataset = coverage.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { FeatureProduct.fromAvro( - CoverageRDDSuite.featFn(r)) + CoverageDatasetSuite.featFn(r)) }) }) @@ -353,16 +353,16 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform coverage to fragment rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 3) } - val fragments: FragmentRDD = coverage.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = coverage.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.fragFn) + rdd.map(CoverageDatasetSuite.fragFn) }) checkSave(fragments) @@ -370,11 +370,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val fragmentsDs: FragmentRDD = coverage.transmuteDataset[Fragment, FragmentProduct, FragmentRDD]( + val fragmentsDs: FragmentDataset = coverage.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { FragmentProduct.fromAvro( - CoverageRDDSuite.fragFn(r)) + CoverageDatasetSuite.fragFn(r)) }) }) @@ -384,16 +384,16 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform coverage to read rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".adam") reads.saveAsParquet(tempPath) assert(sc.loadAlignments(tempPath).rdd.count === 3) } - val reads: AlignmentRecordRDD = coverage.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = coverage.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.readFn) + rdd.map(CoverageDatasetSuite.readFn) }) checkSave(reads) @@ -401,11 +401,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val readsDs: AlignmentRecordRDD = coverage.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val readsDs: AlignmentRecordDataset = coverage.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { AlignmentRecordProduct.fromAvro( - CoverageRDDSuite.readFn(r)) + 
CoverageDatasetSuite.readFn(r)) }) }) @@ -415,16 +415,16 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform coverage to genotype rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) assert(sc.loadGenotypes(tempPath).rdd.count === 3) } - val genotypes: GenotypeRDD = coverage.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = coverage.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.genFn) + rdd.map(CoverageDatasetSuite.genFn) }) checkSave(genotypes) @@ -432,11 +432,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val genotypesDs: GenotypeRDD = coverage.transmuteDataset[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypesDs: GenotypeDataset = coverage.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { GenotypeProduct.fromAvro( - CoverageRDDSuite.genFn(r)) + CoverageDatasetSuite.genFn(r)) }) }) @@ -446,16 +446,16 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform coverage to variant rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 3) } - val variants: VariantRDD = coverage.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = coverage.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.varFn) + rdd.map(CoverageDatasetSuite.varFn) }) checkSave(variants) @@ -463,11 +463,11 @@ class CoverageRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val variantsDs: VariantRDD = coverage.transmuteDataset[Variant, VariantProduct, VariantRDD]( + val variantsDs: VariantDataset = coverage.transmuteDataset[Variant, VariantProduct, VariantDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { VariantProduct.fromAvro( - CoverageRDDSuite.varFn(r)) + CoverageDatasetSuite.varFn(r)) }) }) @@ -477,13 +477,13 @@ class CoverageRDDSuite extends ADAMFunSuite { sparkTest("transform coverage to variant context rdd") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(variantContexts: VariantContextRDD) { + def checkSave(variantContexts: VariantContextDataset) { assert(variantContexts.rdd.count === 3) } - val variantContexts: VariantContextRDD = coverage.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val variantContexts: VariantContextDataset = coverage.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageRDDSuite.vcFn) + rdd.map(CoverageDatasetSuite.vcFn) }) checkSave(variantContexts) @@ -494,7 +494,7 @@ class CoverageRDDSuite extends ADAMFunSuite { val coverage = sc.loadCoverage(testFile("sample_coverage.bed"), optSequenceDictionary = Some(sd)) assert(coverage.sequences.containsReferenceName("chr1")) - val copy = CoverageRDD.apply(coverage.rdd, coverage.sequences, Seq.empty) + val copy = CoverageDataset.apply(coverage.rdd, coverage.sequences, Seq.empty) assert(copy.rdd.count() === coverage.rdd.count()) 
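// --- Illustrative sketch (editor's addition, not part of this patch) ---
// Two ways the renamed CoverageDatasetSuite builds a CoverageDataset: deriving
// coverage from a FeatureDataset, or wrapping an existing RDD[Coverage] via the
// companion object. The feature values and val names below are placeholders
// for this example only.
val sketchFeature = Feature.newBuilder()
  .setContigName("chr1").setStart(1L).setEnd(10L).setScore(3.0).build()
val sketchCoverage: CoverageDataset =
  FeatureDataset(sc.parallelize(Seq(sketchFeature))).toCoverage
val sketchWrapped: CoverageDataset = CoverageDataset(sketchCoverage.rdd)
// --- end sketch ---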
assert(copy.sequences.containsReferenceName("chr1")) } @@ -504,7 +504,7 @@ class CoverageRDDSuite extends ADAMFunSuite { val coverage = sc.loadCoverage(testFile("sample_coverage.bed"), optSequenceDictionary = Some(sd)) assert(coverage.sequences.containsReferenceName("chr1")) - val copy = CoverageRDD.apply(coverage.dataset, coverage.sequences, coverage.samples) + val copy = CoverageDataset.apply(coverage.dataset, coverage.sequences, coverage.samples) assert(copy.dataset.count() === coverage.dataset.count()) assert(copy.sequences.containsReferenceName("chr1")) } @@ -514,7 +514,7 @@ class CoverageRDDSuite extends ADAMFunSuite { val coverage = sc.loadCoverage(testFile("sample_coverage.bed"), optSequenceDictionary = Some(sd)) assert(coverage.sequences.containsReferenceName("chr1")) - val copy = CoverageRDD.apply(coverage.rdd) + val copy = CoverageDataset.apply(coverage.rdd) assert(copy.rdd.count() === coverage.rdd.count()) assert(copy.sequences.containsReferenceName("chr1") === false) } @@ -524,7 +524,7 @@ class CoverageRDDSuite extends ADAMFunSuite { val coverage = sc.loadCoverage(testFile("sample_coverage.bed"), optSequenceDictionary = Some(sd)) assert(coverage.sequences.containsReferenceName("chr1")) - val copy = CoverageRDD.apply(coverage.dataset) + val copy = CoverageDataset.apply(coverage.dataset) assert(copy.dataset.count() === coverage.dataset.count()) assert(copy.sequences.containsReferenceName("chr1") == false) } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala similarity index 89% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala index 4a869a5a98..e1753c6cde 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala @@ -27,13 +27,13 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD + GenotypeDataset, + VariantDataset, + VariantContextDataset } import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, @@ -48,7 +48,7 @@ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ import scala.io.Source -object FeatureRDDSuite extends Serializable { +object FeatureDatasetSuite extends Serializable { def covFn(f: Feature): Coverage = { Coverage(f.getContigName, @@ -102,10 +102,10 @@ object FeatureRDDSuite extends Serializable { } } -class FeatureRDDSuite extends ADAMFunSuite { +class FeatureDatasetSuite extends ADAMFunSuite { def tempLocation(suffix: String = ".adam"): String = { - val tempFile = File.createTempFile("FeatureRDDFunctionsSuite", "") + val tempFile = File.createTempFile("FeatureDatasetFunctionsSuite", "") val tempDir = tempFile.getParentFile new File(tempDir, tempFile.getName + suffix).getAbsolutePath } @@ -114,7 +114,7 @@ class FeatureRDDSuite extends 
ADAMFunSuite { val inputPath = testFile("Homo_sapiens.GRCh37.75.trun100.gtf") val features = sc.loadGtf(inputPath) - val firstGtfRecord = FeatureRDD.toGtf(features.rdd.first) + val firstGtfRecord = FeatureDataset.toGtf(features.rdd.first) val gtfSplitTabs = firstGtfRecord.split('\t') assert(gtfSplitTabs.size === 9) @@ -230,7 +230,7 @@ class FeatureRDDSuite extends ADAMFunSuite { assert(lines.head === GFF3HeaderWriter.HEADER_STRING) val feature = expected.rdd.first - val gff3Columns = FeatureRDD.toGff3(feature).split('\t') + val gff3Columns = FeatureDataset.toGff3(feature).split('\t') assert(gff3Columns.size === 9) assert(gff3Columns(0) === "1") assert(gff3Columns(1) === "Ensembl") @@ -302,7 +302,7 @@ class FeatureRDDSuite extends ADAMFunSuite { expected.saveAsBed(outputPath, asSingleFile = true) val feature = expected.rdd.first - val bedCols = FeatureRDD.toBed(feature).split('\t') + val bedCols = FeatureDataset.toBed(feature).split('\t') assert(bedCols.size === 6) assert(bedCols(0) === "1") assert(bedCols(1) === "1331345") @@ -345,7 +345,7 @@ class FeatureRDDSuite extends ADAMFunSuite { expected.saveAsBed(outputPath, asSingleFile = true) val feature = expected.rdd.first - val bedCols = FeatureRDD.toBed(feature).split('\t') + val bedCols = FeatureDataset.toBed(feature).split('\t') assert(bedCols.size === 12) assert(bedCols(0) === "1") assert(bedCols(1) === "143") @@ -438,7 +438,7 @@ class FeatureRDDSuite extends ADAMFunSuite { // test single record val feature = expected.rdd.first - val interval = FeatureRDD.toInterval(feature).split('\t') + val interval = FeatureDataset.toInterval(feature).split('\t') assert(interval.size === 5) assert(interval(0) === "chr1") assert(interval(1) === "14416") @@ -452,7 +452,7 @@ class FeatureRDDSuite extends ADAMFunSuite { f.getStart == 142111441L && f.getEnd == 142111617L }).first - val rsInterval = FeatureRDD.toInterval(refseqFeature).split('\t') + val rsInterval = FeatureDataset.toInterval(refseqFeature).split('\t') assert(rsInterval.size === 5) assert(rsInterval(0) === "chr7") assert(rsInterval(1) === "142111442") @@ -520,7 +520,7 @@ class FeatureRDDSuite extends ADAMFunSuite { expected.saveAsNarrowPeak(outputPath, asSingleFile = true) val feature = expected.rdd.first - val npColumns = FeatureRDD.toNarrowPeak(feature).split('\t') + val npColumns = FeatureDataset.toNarrowPeak(feature).split('\t') assert(npColumns.size === 10) assert(npColumns(0) === "chr1") assert(npColumns(1) === "713849") @@ -550,7 +550,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f6 = fb.setContigName("1").setStart(10L).setEnd(110L).clearStrand().build() // null strand last val f7 = fb.setContigName("2").build() - val features = FeatureRDD(sc.parallelize(Seq(f7, f6, f5, f4, f3, f2, f1))) + val features = FeatureDataset(sc.parallelize(Seq(f7, f6, f5, f4, f3, f2, f1))) val sorted = features.sortByReference().rdd.collect() assert(f1 == sorted(0)) @@ -572,7 +572,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f6 = fb.setScore(0.9).build() // Double defaults to increasing sort order val f7 = fb.clearScore().build() // nulls last - val features = FeatureRDD(sc.parallelize(Seq(f7, f6, f5, f4, f3, f2, f1))) + val features = FeatureDataset(sc.parallelize(Seq(f7, f6, f5, f4, f3, f2, f1))) val sorted = features.sortByReference().rdd.collect() assert(f1 == sorted(0)) @@ -590,7 +590,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f2 = fb.setGeneId("gene2").build() val f3 = fb.clearGeneId().build() // nulls last - val features = FeatureRDD(sc.parallelize(Seq(f3, f2, f1))) + val 
features = FeatureDataset(sc.parallelize(Seq(f3, f2, f1))) val sorted = features.sortByReference().rdd.collect() assert(f1 == sorted(0)) @@ -606,7 +606,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f4 = fb.setGeneId("gene2").setTranscriptId("transcript2").build() val f5 = fb.setGeneId("gene2").clearTranscriptId().build() // nulls last - val features = FeatureRDD(sc.parallelize(Seq(f5, f4, f3, f2, f1))) + val features = FeatureDataset(sc.parallelize(Seq(f5, f4, f3, f2, f1))) val sorted = features.sortByReference().rdd.collect() assert(f1 == sorted(0)) @@ -628,7 +628,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f8 = fb.setGeneId("gene2").setTranscriptId("transcript1").setAttributes(ImmutableMap.of("rank", "2")).build() val f9 = fb.setGeneId("gene2").setTranscriptId("transcript1").clearAttributes().build() // nulls last - val features = FeatureRDD(sc.parallelize(Seq(f9, f8, f7, f6, f5, f4, f3, f2, f1))) + val features = FeatureDataset(sc.parallelize(Seq(f9, f8, f7, f6, f5, f4, f3, f2, f1))) val sorted = features.sortByReference().rdd.collect() assert(f1 == sorted(0)) @@ -650,7 +650,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f4 = fb.setAttributes(ImmutableMap.of("rank", "2")).build() val f5 = fb.clearAttributes().build() // nulls last - val features = FeatureRDD(sc.parallelize(Seq(f5, f4, f3, f2, f1))) + val features = FeatureDataset(sc.parallelize(Seq(f5, f4, f3, f2, f1))) val sorted = features.sortByReference().rdd.collect() assert(f1 == sorted(0)) @@ -660,14 +660,14 @@ class FeatureRDDSuite extends ADAMFunSuite { assert(f5 == sorted(4)) } - sparkTest("correctly flatmaps CoverageRDD from FeatureRDD") { + sparkTest("correctly flatmaps CoverageDataset from FeatureDataset") { val f1 = Feature.newBuilder().setContigName("chr1").setStart(1).setEnd(10).setScore(3.0).build() val f2 = Feature.newBuilder().setContigName("chr1").setStart(15).setEnd(20).setScore(2.0).build() val f3 = Feature.newBuilder().setContigName("chr2").setStart(15).setEnd(20).setScore(2.0).build() - val featureRDD: FeatureRDD = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) - val coverageRDD: CoverageRDD = featureRDD.toCoverage - val coverage = coverageRDD.flatten + val featureDs: FeatureDataset = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) + val coverageDs: CoverageDataset = featureDs.toCoverage + val coverage = coverageDs.flatten assert(coverage.rdd.count == 19) } @@ -724,8 +724,8 @@ class FeatureRDDSuite extends ADAMFunSuite { assert(jRdd.rdd.count === 5L) assert(jRdd0.rdd.count === 5L) - val joinedFeatures: FeatureRDD = jRdd - .transmute[Feature, FeatureProduct, FeatureRDD]((rdd: RDD[(Feature, Feature)]) => { + val joinedFeatures: FeatureDataset = jRdd + .transmute[Feature, FeatureProduct, FeatureDataset]((rdd: RDD[(Feature, Feature)]) => { rdd.map(_._1) }) val tempPath = tmpLocation(".adam") @@ -904,7 +904,7 @@ class FeatureRDDSuite extends ADAMFunSuite { implicit val tFormatter = BEDInFormatter implicit val uFormatter = new BEDOutFormatter - val pipedRdd: FeatureRDD = frdd.pipe[Feature, FeatureProduct, FeatureRDD, BEDInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: FeatureDataset = frdd.pipe[Feature, FeatureProduct, FeatureDataset, BEDInFormatter](Seq("tee", "/dev/null")) assert(pipedRdd.rdd.count >= frdd.rdd.count) assert(pipedRdd.rdd.distinct.count === frdd.rdd.distinct.count) @@ -917,7 +917,7 @@ class FeatureRDDSuite extends ADAMFunSuite { implicit val tFormatter = GTFInFormatter implicit val uFormatter = new GTFOutFormatter - val pipedRdd: FeatureRDD = frdd.pipe[Feature, 
FeatureProduct, FeatureRDD, GTFInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: FeatureDataset = frdd.pipe[Feature, FeatureProduct, FeatureDataset, GTFInFormatter](Seq("tee", "/dev/null")) assert(pipedRdd.rdd.count >= frdd.rdd.count) assert(pipedRdd.rdd.distinct.count === frdd.rdd.distinct.count) @@ -930,7 +930,7 @@ class FeatureRDDSuite extends ADAMFunSuite { implicit val tFormatter = GFF3InFormatter implicit val uFormatter = new GFF3OutFormatter - val pipedRdd: FeatureRDD = frdd.pipe[Feature, FeatureProduct, FeatureRDD, GFF3InFormatter](Seq("tee", "/dev/null")) + val pipedRdd: FeatureDataset = frdd.pipe[Feature, FeatureProduct, FeatureDataset, GFF3InFormatter](Seq("tee", "/dev/null")) assert(pipedRdd.rdd.count >= frdd.rdd.count) assert(pipedRdd.rdd.distinct.count === frdd.rdd.distinct.count) @@ -943,14 +943,14 @@ class FeatureRDDSuite extends ADAMFunSuite { implicit val tFormatter = NarrowPeakInFormatter implicit val uFormatter = new NarrowPeakOutFormatter - val pipedRdd: FeatureRDD = frdd.pipe[Feature, FeatureProduct, FeatureRDD, NarrowPeakInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: FeatureDataset = frdd.pipe[Feature, FeatureProduct, FeatureDataset, NarrowPeakInFormatter](Seq("tee", "/dev/null")) assert(pipedRdd.rdd.count >= frdd.rdd.count) assert(pipedRdd.rdd.distinct.count === frdd.rdd.distinct.count) } sparkTest("load parquet to sql, save, re-read from avro") { - def testMetadata(fRdd: FeatureRDD) { + def testMetadata(fRdd: FeatureDataset) { val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) val sampleRdd = fRdd.addSample(Sample.newBuilder().setName("Sample").build()) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) @@ -979,7 +979,7 @@ class FeatureRDDSuite extends ADAMFunSuite { } sparkTest("load partitioned parquet to sql, save, re-read from avro") { - def testMetadata(fRdd: FeatureRDD) { + def testMetadata(fRdd: FeatureDataset) { val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) } @@ -1004,16 +1004,16 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to contig rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(contigs: NucleotideContigFragmentRDD) { + def checkSave(contigs: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".adam") contigs.saveAsParquet(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count === 3) } - val contigs: NucleotideContigFragmentRDD = features.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigs: NucleotideContigFragmentDataset = features.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.ncfFn) + rdd.map(FeatureDatasetSuite.ncfFn) }) checkSave(contigs) @@ -1021,11 +1021,11 @@ class FeatureRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentRDD = features.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigsDs: NucleotideContigFragmentDataset = features.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (ds: Dataset[FeatureProduct]) => { ds.map(r => { NucleotideContigFragmentProduct.fromAvro( - FeatureRDDSuite.ncfFn(r.toAvro)) + FeatureDatasetSuite.ncfFn(r.toAvro)) 
}) }) @@ -1035,16 +1035,16 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to coverage rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 3) } - val coverage: CoverageRDD = features.transmute[Coverage, Coverage, CoverageRDD]( + val coverage: CoverageDataset = features.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.covFn) + rdd.map(FeatureDatasetSuite.covFn) }) checkSave(coverage) @@ -1052,9 +1052,9 @@ class FeatureRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val coverageDs: CoverageRDD = features.transmuteDataset[Coverage, Coverage, CoverageRDD]( + val coverageDs: CoverageDataset = features.transmuteDataset[Coverage, Coverage, CoverageDataset]( (ds: Dataset[FeatureProduct]) => { - ds.map(r => FeatureRDDSuite.covFn(r.toAvro)) + ds.map(r => FeatureDatasetSuite.covFn(r.toAvro)) }) checkSave(coverageDs) @@ -1063,16 +1063,16 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to fragment rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 3) } - val fragments: FragmentRDD = features.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = features.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.fragFn) + rdd.map(FeatureDatasetSuite.fragFn) }) checkSave(fragments) @@ -1080,11 +1080,11 @@ class FeatureRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val fragmentsDs: FragmentRDD = features.transmuteDataset[Fragment, FragmentProduct, FragmentRDD]( + val fragmentsDs: FragmentDataset = features.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( (ds: Dataset[FeatureProduct]) => { ds.map(r => { FragmentProduct.fromAvro( - FeatureRDDSuite.fragFn(r.toAvro)) + FeatureDatasetSuite.fragFn(r.toAvro)) }) }) @@ -1094,16 +1094,16 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to read rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".adam") reads.saveAsParquet(tempPath) assert(sc.loadAlignments(tempPath).rdd.count === 3) } - val reads: AlignmentRecordRDD = features.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = features.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.readFn) + rdd.map(FeatureDatasetSuite.readFn) }) checkSave(reads) @@ -1111,11 +1111,11 @@ class FeatureRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val readsDs: AlignmentRecordRDD = features.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val readsDs: AlignmentRecordDataset = features.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (ds: 
Dataset[FeatureProduct]) => { ds.map(r => { AlignmentRecordProduct.fromAvro( - FeatureRDDSuite.readFn(r.toAvro)) + FeatureDatasetSuite.readFn(r.toAvro)) }) }) @@ -1125,16 +1125,16 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to genotype rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) assert(sc.loadGenotypes(tempPath).rdd.count === 3) } - val genotypes: GenotypeRDD = features.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = features.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.genFn) + rdd.map(FeatureDatasetSuite.genFn) }) checkSave(genotypes) @@ -1142,11 +1142,11 @@ class FeatureRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val genotypesDs: GenotypeRDD = features.transmuteDataset[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypesDs: GenotypeDataset = features.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( (ds: Dataset[FeatureProduct]) => { ds.map(r => { GenotypeProduct.fromAvro( - FeatureRDDSuite.genFn(r.toAvro)) + FeatureDatasetSuite.genFn(r.toAvro)) }) }) @@ -1156,16 +1156,16 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to variant rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 3) } - val variants: VariantRDD = features.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = features.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.varFn) + rdd.map(FeatureDatasetSuite.varFn) }) checkSave(variants) @@ -1173,11 +1173,11 @@ class FeatureRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val variantsDs: VariantRDD = features.transmuteDataset[Variant, VariantProduct, VariantRDD]( + val variantsDs: VariantDataset = features.transmuteDataset[Variant, VariantProduct, VariantDataset]( (ds: Dataset[FeatureProduct]) => { ds.map(r => { VariantProduct.fromAvro( - FeatureRDDSuite.varFn(r.toAvro)) + FeatureDatasetSuite.varFn(r.toAvro)) }) }) @@ -1187,13 +1187,13 @@ class FeatureRDDSuite extends ADAMFunSuite { sparkTest("transform features to variant context rdd") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(variantContexts: VariantContextRDD) { + def checkSave(variantContexts: VariantContextDataset) { assert(variantContexts.rdd.count === 3) } - val variantContexts: VariantContextRDD = features.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val variantContexts: VariantContextDataset = features.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureRDDSuite.vcFn) + rdd.map(FeatureDatasetSuite.vcFn) }) checkSave(variantContexts) @@ -1226,7 +1226,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build(); val f3 = 
fb.setContigName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.1").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) assert(features.filterToGene("CCDS22.1").rdd.count() === 2) } @@ -1235,7 +1235,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.1").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) val featuresDs = features.transformDataset(ds => ds) assert(features.filterToGene("CCDS22.1").rdd.count() === 2) } @@ -1245,7 +1245,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.2").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) assert(features.filterToGenes(Seq("CCDS22.1", "CCDS22.2")).rdd.count() === 2) } @@ -1254,7 +1254,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.2").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) val featuresDs = features.transformDataset(ds => ds) assert(features.filterToGenes(Seq("CCDS22.1", "CCDS22.2")).rdd.count() === 2) } @@ -1264,7 +1264,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445648").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) assert(features.filterToTranscript("ENST00000445648").rdd.count() === 2) } @@ -1273,7 +1273,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445648").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) val featuresDs = features.transformDataset(ds => ds) assert(features.filterToTranscript("ENST00000445648").rdd.count() === 2) } @@ -1283,7 +1283,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445649").build(); - val features = 
FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) assert(features.filterToTranscripts(Seq("ENST00000445648", "ENST00000445649")).rdd.count() === 2) } @@ -1292,7 +1292,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445649").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) val featuresDs = features.transformDataset(ds => ds) assert(features.filterToTranscripts(Seq("ENST00000445648", "ENST00000445649")).rdd.count() === 2) } @@ -1302,7 +1302,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779983").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) assert(features.filterToExon("ENSE00001779983").rdd.count() === 2) } @@ -1311,7 +1311,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779983").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) val featuresDs = features.transformDataset(ds => ds) assert(features.filterToExon("ENSE00001779983").rdd.count() === 2) } @@ -1321,7 +1321,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779984").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) assert(features.filterToExons(Seq("ENSE00001779983", "ENSE00001779984")).rdd.count() === 2) } @@ -1330,7 +1330,7 @@ class FeatureRDDSuite extends ADAMFunSuite { val f1 = fb.setContigName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build(); val f2 = fb.setContigName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build(); val f3 = fb.setContigName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779984").build(); - val features = FeatureRDD(sc.parallelize(Seq(f1, f2, f3))) + val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3))) val featuresDs = features.transformDataset(ds => ds) assert(features.filterToExons(Seq("ENSE00001779983", "ENSE00001779984")).rdd.count() === 2) } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala similarity index 79% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala rename to 
adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala index 91f0522130..6c08cbea15 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala @@ -26,18 +26,18 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.read.{ - AlignmentRecordRDD, - AlignmentRecordRDDSuite, + AlignmentRecordDataset, + AlignmentRecordDatasetSuite, AnySAMOutFormatter, QualityScoreBin } import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD + GenotypeDataset, + VariantDataset, + VariantContextDataset } import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, @@ -52,22 +52,22 @@ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ import scala.collection.JavaConversions._ -object FragmentRDDSuite extends Serializable { +object FragmentDatasetSuite extends Serializable { def readFn(f: Fragment): AlignmentRecord = { f.getAlignments.get(0) } def vcFn(f: Fragment): VariantContext = { - VariantContext(AlignmentRecordRDDSuite.varFn(f)) + VariantContext(AlignmentRecordDatasetSuite.varFn(f)) } } -class FragmentRDDSuite extends ADAMFunSuite { +class FragmentDatasetSuite extends ADAMFunSuite { sparkTest("don't lose any reads when piping interleaved fastq to sam") { // write suffixes at end of reads - sc.hadoopConfiguration.setBoolean(FragmentRDD.WRITE_SUFFIXES, true) + sc.hadoopConfiguration.setBoolean(FragmentDataset.WRITE_SUFFIXES, true) val fragmentsPath = testFile("interleaved_fastq_sample1.ifq") val ardd = sc.loadFragments(fragmentsPath) @@ -82,7 +82,7 @@ class FragmentRDDSuite extends ADAMFunSuite { // this script converts interleaved fastq to unaligned sam val scriptPath = testFile("fastq_to_usam.py") - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, InterleavedFASTQInFormatter](Seq("python", "$0"), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, InterleavedFASTQInFormatter](Seq("python", "$0"), files = Seq(scriptPath)) val newRecords = pipedRdd.rdd.count assert(2 * records === newRecords) @@ -100,7 +100,7 @@ class FragmentRDDSuite extends ADAMFunSuite { // this script converts tab5 to unaligned sam val scriptPath = testFile("tab5_to_usam.py") - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, Tab5InFormatter](Seq("python", "$0"), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, Tab5InFormatter](Seq("python", "$0"), files = Seq(scriptPath)) val newRecords = pipedRdd.rdd.count assert(2 * records === newRecords) @@ -108,7 +108,7 @@ class FragmentRDDSuite extends ADAMFunSuite { sparkTest("don't lose any reads when piping tab6 to sam") { // write suffixes at end of reads - sc.hadoopConfiguration.setBoolean(FragmentRDD.WRITE_SUFFIXES, true) + sc.hadoopConfiguration.setBoolean(FragmentDataset.WRITE_SUFFIXES, true) val fragmentsPath = testFile("interleaved_fastq_sample1.ifq") val ardd = 
sc.loadFragments(fragmentsPath) @@ -121,7 +121,7 @@ class FragmentRDDSuite extends ADAMFunSuite { // this script converts tab6 to unaligned sam val scriptPath = testFile("tab6_to_usam.py") - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, Tab6InFormatter](Seq("python", "$0"), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, Tab6InFormatter](Seq("python", "$0"), files = Seq(scriptPath)) val newRecords = pipedRdd.rdd.count assert(2 * records === newRecords) @@ -173,8 +173,8 @@ class FragmentRDDSuite extends ADAMFunSuite { assert(jRdd.rdd.count === 5) assert(jRdd0.rdd.count === 5) - val joinedFragments: FragmentRDD = jRdd - .transmute[Fragment, FragmentProduct, FragmentRDD]((rdd: RDD[(Fragment, Feature)]) => { + val joinedFragments: FragmentDataset = jRdd + .transmute[Fragment, FragmentProduct, FragmentDataset]((rdd: RDD[(Fragment, Feature)]) => { rdd.map(_._1) }) val tempPath = tmpLocation(".adam") @@ -333,7 +333,7 @@ class FragmentRDDSuite extends ADAMFunSuite { assert(qualityScoreCounts(10) === 7101) } - sparkTest("union two rdds of fragments together") { + sparkTest("union two genomic datasets of fragments together") { val reads1 = sc.loadAlignments(testFile("bqsr1.sam")).toFragments val reads2 = sc.loadAlignments(testFile("small.sam")).toFragments val union = reads1.union(reads2) @@ -345,12 +345,12 @@ class FragmentRDDSuite extends ADAMFunSuite { } sparkTest("load parquet to sql, save, re-read from avro") { - def testMetadata(fRdd: FragmentRDD) { + def testMetadata(fRdd: FragmentDataset) { val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - val rgRdd = fRdd.addRecordGroup(RecordGroup("test", "aRg")) - assert(rgRdd.recordGroups("aRg").sample === "test") + val rgDataset = fRdd.addRecordGroup(RecordGroup("test", "aRg")) + assert(rgDataset.recordGroups("aRg").sample === "test") } val inputPath = testFile("small.sam") @@ -379,19 +379,19 @@ class FragmentRDDSuite extends ADAMFunSuite { assert(rdd4.dataset.count === 20) } - sparkTest("transform fragments to contig rdd") { + sparkTest("transform fragments to contig genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(ncRdd: NucleotideContigFragmentRDD) { + def checkSave(ncRdd: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".fa") ncRdd.saveAsFasta(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count.toInt === 20) } - val features: NucleotideContigFragmentRDD = fragments.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val features: NucleotideContigFragmentDataset = fragments.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordRDDSuite.ncfFn) + rdd.map(AlignmentRecordDatasetSuite.ncfFn) }) checkSave(features) @@ -399,30 +399,30 @@ class FragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: NucleotideContigFragmentRDD = fragments.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val featuresDs: NucleotideContigFragmentDataset = fragments.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { 
NucleotideContigFragmentProduct.fromAvro( - AlignmentRecordRDDSuite.ncfFn(r.toAvro)) + AlignmentRecordDatasetSuite.ncfFn(r.toAvro)) }) }) checkSave(featuresDs) } - sparkTest("transform fragments to coverage rdd") { + sparkTest("transform fragments to coverage genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 20) } - val coverage: CoverageRDD = fragments.transmute[Coverage, Coverage, CoverageRDD]( + val coverage: CoverageDataset = fragments.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordRDDSuite.covFn) + rdd.map(AlignmentRecordDatasetSuite.covFn) }) checkSave(coverage) @@ -430,27 +430,27 @@ class FragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val coverageDs: CoverageRDD = fragments.transmuteDataset[Coverage, Coverage, CoverageRDD]( + val coverageDs: CoverageDataset = fragments.transmuteDataset[Coverage, Coverage, CoverageDataset]( (ds: Dataset[FragmentProduct]) => { - ds.map(r => AlignmentRecordRDDSuite.covFn(r.toAvro)) + ds.map(r => AlignmentRecordDatasetSuite.covFn(r.toAvro)) }) checkSave(coverageDs) } - sparkTest("transform fragments to feature rdd") { + sparkTest("transform fragments to feature genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.saveAsBed(tempPath) assert(sc.loadFeatures(tempPath).rdd.count === 20) } - val features: FeatureRDD = fragments.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = fragments.transmute[Feature, FeatureProduct, FeatureDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordRDDSuite.featFn) + rdd.map(AlignmentRecordDatasetSuite.featFn) }) checkSave(features) @@ -458,30 +458,30 @@ class FragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: FeatureRDD = fragments.transmuteDataset[Feature, FeatureProduct, FeatureRDD]( + val featuresDs: FeatureDataset = fragments.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { FeatureProduct.fromAvro( - AlignmentRecordRDDSuite.featFn(r.toAvro)) + AlignmentRecordDatasetSuite.featFn(r.toAvro)) }) }) checkSave(featuresDs) } - sparkTest("transform fragments to read rdd") { + sparkTest("transform fragments to read genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".sam") reads.saveAsSam(tempPath) assert(sc.loadAlignments(tempPath).rdd.count === 20) } - val reads: AlignmentRecordRDD = fragments.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = fragments.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[Fragment]) => { - rdd.map(FragmentRDDSuite.readFn) + rdd.map(FragmentDatasetSuite.readFn) }) checkSave(reads) @@ -489,30 +489,30 @@ class FragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val readDs: AlignmentRecordRDD = 
fragments.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val readDs: AlignmentRecordDataset = fragments.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { AlignmentRecordProduct.fromAvro( - FragmentRDDSuite.readFn(r.toAvro)) + FragmentDatasetSuite.readFn(r.toAvro)) }) }) checkSave(readDs) } - sparkTest("transform fragments to genotype rdd") { + sparkTest("transform fragments to genotype genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) assert(sc.loadGenotypes(tempPath).rdd.count === 20) } - val genotypes: GenotypeRDD = fragments.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = fragments.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordRDDSuite.genFn) + rdd.map(AlignmentRecordDatasetSuite.genFn) }) checkSave(genotypes) @@ -520,30 +520,30 @@ class FragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val genotypesDs: GenotypeRDD = fragments.transmuteDataset[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypesDs: GenotypeDataset = fragments.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { GenotypeProduct.fromAvro( - AlignmentRecordRDDSuite.genFn(r.toAvro)) + AlignmentRecordDatasetSuite.genFn(r.toAvro)) }) }) checkSave(genotypesDs) } - sparkTest("transform fragments to variant rdd") { + sparkTest("transform fragments to variant genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 20) } - val variants: VariantRDD = fragments.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = fragments.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordRDDSuite.varFn) + rdd.map(AlignmentRecordDatasetSuite.varFn) }) checkSave(variants) @@ -551,27 +551,27 @@ class FragmentRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val variantsDs: VariantRDD = fragments.transmuteDataset[Variant, VariantProduct, VariantRDD]( + val variantsDs: VariantDataset = fragments.transmuteDataset[Variant, VariantProduct, VariantDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { VariantProduct.fromAvro( - AlignmentRecordRDDSuite.varFn(r.toAvro)) + AlignmentRecordDatasetSuite.varFn(r.toAvro)) }) }) checkSave(variantsDs) } - sparkTest("transform fragments to variant context rdd") { + sparkTest("transform fragments to variant context genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(variantContexts: VariantContextRDD) { + def checkSave(variantContexts: VariantContextDataset) { assert(variantContexts.rdd.count === 20) } - val variantContexts: VariantContextRDD = fragments.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val variantContexts: VariantContextDataset = fragments.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[Fragment]) => { - 
rdd.map(FragmentRDDSuite.vcFn) + rdd.map(FragmentDatasetSuite.vcFn) }) checkSave(variantContexts) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala similarity index 88% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala index 76e6890f62..a68fde960d 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala @@ -40,13 +40,13 @@ import org.bdgenomics.adam.rdd.{ ADAMContext, TestSaveArgs } -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.variant.{ - GenotypeRDD, - VariantRDD, - VariantContextRDD, + GenotypeDataset, + VariantDataset, + VariantContextDataset, VCFOutFormatter } import org.bdgenomics.adam.sql.{ @@ -75,14 +75,14 @@ private object SequenceIndexWithReadOrdering extends Ordering[((Int, Long), (Ali } } -class SameTypeFunction2 extends Function2[AlignmentRecordRDD, RDD[AlignmentRecord], AlignmentRecordRDD] { +class SameTypeFunction2 extends Function2[AlignmentRecordDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - def call(v1: AlignmentRecordRDD, v2: RDD[AlignmentRecord]): AlignmentRecordRDD = { + def call(v1: AlignmentRecordDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { ADAMContext.alignmentRecordsToAlignmentRecordsConversionFn(v1, v2) } } -object AlignmentRecordRDDSuite extends Serializable { +object AlignmentRecordDatasetSuite extends Serializable { private def fragToRead(f: Fragment): AlignmentRecord = { f.getAlignments().get(0) @@ -153,7 +153,7 @@ object AlignmentRecordRDDSuite extends Serializable { } } -class AlignmentRecordRDDSuite extends ADAMFunSuite { +class AlignmentRecordDatasetSuite extends ADAMFunSuite { sparkTest("sorting reads") { val random = new Random("sorting".hashCode) @@ -175,7 +175,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val contigNames = rdd.flatMap(r => Option(r.getContigName)).distinct.collect val sd = new SequenceDictionary(contigNames.map(v => SequenceRecord(v, 1000000L)).toVector) - val sortedReads = AlignmentRecordRDD(rdd, sd, RecordGroupDictionary.empty, Seq.empty) + val sortedReads = AlignmentRecordDataset(rdd, sd, RecordGroupDictionary.empty, Seq.empty) .sortReadsByReferencePosition() .rdd .collect() @@ -201,7 +201,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("coverage does not fail on unmapped reads") { val inputPath = testFile("unmapped.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) .transform(rdd => { rdd.filter(!_.getReadMapped) }) @@ -212,16 +212,16 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("computes coverage") { val inputPath = testFile("artificial.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) // get pileup at 
position 30 val pointCoverage = reads.filterByOverlappingRegion(ReferenceRegion("artificial", 30, 31)).rdd.count - def testCoverage(coverage: CoverageRDD) { + def testCoverage(coverage: CoverageDataset) { assert(coverage.rdd.filter(r => r.start == 30).first.count == pointCoverage) } - val coverageRdd = reads.toCoverage() - testCoverage(coverageRdd) + val coverage = reads.toCoverage() + testCoverage(coverage) // test dataset path val readsDs = reads.transformDataset(ds => ds) @@ -235,9 +235,9 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val relativePath = new File(testFile("NA12878.1_854950_855150.sam")).getParentFile.getPath val inputPath = relativePath + "/{NA12878.1_854950_855150,bqsr1}.sam" - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) - def countBySampleId(coverage: CoverageRDD, sampleId: String): Long = { + def countBySampleId(coverage: CoverageDataset, sampleId: String): Long = { coverage.rdd.filter(r => r.optSampleId == Some(sampleId)).count } @@ -248,10 +248,10 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("merges adjacent records with equal coverage values") { val inputPath = testFile("artificial.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) - // repartition reads to 1 partition to acheive maximal merging of coverage - val coverage: CoverageRDD = reads.transform(_.repartition(1)) + // repartition reads to 1 partition to achieve maximal merging of coverage + val coverage: CoverageDataset = reads.transform(_.repartition(1)) .toCoverage() .sort() .collapse() @@ -283,7 +283,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { }).toVector) val rdd = sc.parallelize(reads) - val sortedReads = AlignmentRecordRDD(rdd, sd, RecordGroupDictionary.empty, Seq.empty) + val sortedReads = AlignmentRecordDataset(rdd, sd, RecordGroupDictionary.empty, Seq.empty) .sortReadsByReferencePositionAndIndex() .rdd .collect() @@ -653,7 +653,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsParquet with save args, sequence dictionary, and record group dictionary") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation() reads.saveAsParquet(TestSaveArgs(outputPath)) val unfilteredReads = sc.loadAlignments(outputPath) @@ -665,12 +665,12 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { } sparkTest("load parquet to sql, save, re-read from avro") { - def testMetadata(arRdd: AlignmentRecordRDD) { + def testMetadata(arRdd: AlignmentRecordDataset) { val sequenceRdd = arRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - val rgRdd = arRdd.addRecordGroup(RecordGroup("test", "aRg")) - assert(rgRdd.recordGroups("aRg").sample === "test") + val rgDataset = arRdd.addRecordGroup(RecordGroup("test", "aRg")) + assert(rgDataset.recordGroups("aRg").sample === "test") } val inputPath = testFile("small.sam") @@ -701,12 +701,12 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { } sparkTest("load from sam, save as partitioned parquet, and re-read from partitioned parquet") { - def testMetadata(arRdd: AlignmentRecordRDD) { + def testMetadata(arRdd: AlignmentRecordDataset) { val sequenceRdd = arRdd.addSequence(SequenceRecord("aSequence", 1000L)) 
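// A minimal, hedged sketch of the metadata helpers exercised by testMetadata above,
// written against the renamed AlignmentRecordDataset API shown in this diff. It assumes
// an ADAMContext-enabled SparkContext `sc` (as ADAMFunSuite provides) and an illustrative
// input path; the value names here are not taken from the patch.
import org.bdgenomics.adam.models.{ RecordGroup, SequenceRecord }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset

val readsWithMetadata: AlignmentRecordDataset = sc.loadAlignments("small.sam")
// Each helper returns a copy of the dataset with the extra metadata attached, which is
// what the assertions in testMetadata check after the rename.
val withSequence = readsWithMetadata.addSequence(SequenceRecord("aSequence", 1000L))
val withRecordGroup = readsWithMetadata.addRecordGroup(RecordGroup("test", "aRg"))
assert(withSequence.sequences.containsReferenceName("aSequence"))
assert(withRecordGroup.recordGroups("aRg").sample == "test")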
assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - val rgRdd = arRdd.addRecordGroup(RecordGroup("test", "aRg")) - assert(rgRdd.recordGroups("aRg").sample === "test") + val rgDataset = arRdd.addRecordGroup(RecordGroup("test", "aRg")) + assert(rgDataset.recordGroups("aRg").sample === "test") } val inputPath = testFile("multi_chr.sam") @@ -741,7 +741,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("save as SAM format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".sam") reads.save(TestSaveArgs(outputPath)) assert(new File(outputPath).exists()) @@ -749,7 +749,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("save as sorted SAM format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".sam") reads.save(TestSaveArgs(outputPath), true) assert(new File(outputPath).exists()) @@ -757,7 +757,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("save as BAM format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".bam") reads.save(TestSaveArgs(outputPath)) assert(new File(outputPath).exists()) @@ -765,7 +765,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("save as sorted BAM format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".bam") reads.save(TestSaveArgs(outputPath), true) assert(new File(outputPath).exists()) @@ -773,7 +773,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("save as FASTQ format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".fq") reads.save(TestSaveArgs(outputPath)) assert(new File(outputPath).exists()) @@ -781,7 +781,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("save as ADAM parquet format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".adam") reads.save(TestSaveArgs(outputPath)) assert(new File(outputPath).exists()) @@ -789,7 +789,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsSam SAM format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".sam") reads.saveAsSam(outputPath, asType = Some(SAMFormat.SAM)) assert(new File(outputPath).exists()) @@ -797,7 +797,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsSam SAM format single file") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".sam") reads.saveAsSam(outputPath, asType = Some(SAMFormat.SAM), @@ -807,7 +807,7 @@ class AlignmentRecordRDDSuite 
extends ADAMFunSuite { sparkTest("saveAsSam sorted SAM format single file") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".sam") reads.saveAsSam(outputPath, asType = Some(SAMFormat.SAM), @@ -818,7 +818,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsSam BAM format") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".bam") reads.saveAsSam(outputPath, asType = Some(SAMFormat.BAM)) assert(new File(outputPath).exists()) @@ -826,7 +826,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsSam BAM format single file") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".bam") reads.saveAsSam(outputPath, asType = Some(SAMFormat.BAM), @@ -836,7 +836,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsSam sorted BAM format single file") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".bam") reads.saveAsSam(outputPath, asType = Some(SAMFormat.BAM), @@ -847,7 +847,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsFastq") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".fq") reads.saveAsFastq(outputPath, fileName2Opt = None) assert(new File(outputPath).exists()) @@ -855,7 +855,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsFastq as single file") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".fq") reads.saveAsFastq(outputPath, fileName2Opt = None, asSingleFile = true) val outputFile = new File(outputPath) @@ -864,7 +864,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsFastq with original base qualities") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".fq") reads.saveAsFastq(outputPath, fileName2Opt = None, outputOriginalBaseQualities = true) assert(new File(outputPath).exists()) @@ -872,7 +872,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsFastq sorted by read name") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath = tmpLocation(".fq") reads.saveAsFastq(outputPath, fileName2Opt = None, outputOriginalBaseQualities = false, sort = true) assert(new File(outputPath).exists()) @@ -880,7 +880,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsFastq sorted by read name with original base qualities") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset 
= sc.loadAlignments(inputPath) val outputPath = tmpLocation(".fq") reads.saveAsFastq(outputPath, fileName2Opt = None, outputOriginalBaseQualities = true, sort = true) assert(new File(outputPath).exists()) @@ -888,7 +888,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsFastq paired FASTQ") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath1 = tmpLocation("_1.fq") val outputPath2 = tmpLocation("_2.fq") reads.saveAsFastq(outputPath1, Some(outputPath2)) @@ -898,7 +898,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsPairedFastq") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath1 = tmpLocation("_1.fq") val outputPath2 = tmpLocation("_2.fq") reads.saveAsPairedFastq(outputPath1, outputPath2) @@ -908,7 +908,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { sparkTest("saveAsPairedFastq as single files") { val inputPath = testFile("small.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) val outputPath1 = tmpLocation("_1.fq") val outputPath2 = tmpLocation("_2.fq") reads.saveAsPairedFastq(outputPath1, outputPath2, asSingleFile = true) @@ -926,7 +926,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { implicit val tFormatter = SAMInFormatter implicit val uFormatter = new AnySAMOutFormatter - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, SAMInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, SAMInFormatter](Seq("tee", "/dev/null")) val newRecords = pipedRdd.rdd.count assert(records === newRecords) @@ -939,7 +939,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { implicit val tFormatter = SAMInFormatter implicit val uFormatter = new AnySAMOutFormatter - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, SAMInFormatter](Seq("sleep", "10"), optTimeout = Some(5)) + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, SAMInFormatter](Seq("sleep", "10"), optTimeout = Some(5)) val newRecords = pipedRdd.rdd.count assert(newRecords === 0) } @@ -954,7 +954,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { // this script reads the reads into a temp file, which is then read to // stdout, then we sleep for 10 sec, then we read to stdout again val scriptPath = testFile("timeout.py") - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, SAMInFormatter](Seq("python", "$0"), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, SAMInFormatter](Seq("python", "$0"), files = Seq(scriptPath)) val newRecords = pipedRdd.rdd.count assert(newRecords === (2 * ardd.rdd.count)) @@ -970,7 +970,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { // this script reads the reads into a temp file, which is then read to // stdout, then we sleep for 10 sec, then we read to stdout again val scriptPath = testFile("timeout.py") - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, 
AlignmentRecordRDD, SAMInFormatter](Seq("python", "$0"), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, SAMInFormatter](Seq("python", "$0"), optTimeout = Some(5), files = Seq(scriptPath)) val newRecords = pipedRdd.rdd.count @@ -982,7 +982,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val ardd = sc.loadBam(reads12Path) val records = ardd.rdd.count - val pipedRdd = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, SAMInFormatter]( + val pipedRdd = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, SAMInFormatter]( Seq("tee", "/dev/null"), (List.empty[String]: java.util.List[String]), (Map.empty[String, String]: java.util.Map[String, String]), @@ -1002,14 +1002,14 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { implicit val tFormatter = BAMInFormatter implicit val uFormatter = new AnySAMOutFormatter - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, BAMInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, BAMInFormatter](Seq("tee", "/dev/null")) val newRecords = pipedRdd.rdd.count assert(records === newRecords) } sparkTest("don't lose any reads when piping fastq to sam") { // write suffixes at end of reads - sc.hadoopConfiguration.setBoolean(AlignmentRecordRDD.WRITE_SUFFIXES, true) + sc.hadoopConfiguration.setBoolean(AlignmentRecordDataset.WRITE_SUFFIXES, true) val fragmentsPath = testFile("interleaved_fastq_sample1.ifq") val ardd = sc.loadFragments(fragmentsPath).toReads @@ -1024,7 +1024,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { // this script converts interleaved fastq to unaligned sam val scriptPath = testFile("fastq_to_usam.py") - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, FASTQInFormatter](Seq("python", "$0"), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, FASTQInFormatter](Seq("python", "$0"), files = Seq(scriptPath)) val newRecords = pipedRdd.rdd.count assert(records === newRecords) @@ -1042,7 +1042,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { implicit val tFormatter = SAMInFormatter implicit val uFormatter = new AnySAMOutFormatter - val pipedRdd: AlignmentRecordRDD = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD, SAMInFormatter](Seq("/bin/bash", scriptPath), + val pipedRdd: AlignmentRecordDataset = ardd.pipe[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset, SAMInFormatter](Seq("/bin/bash", scriptPath), environment = Map(("INPUT_PATH" -> smallPath), ("OUTPUT_PATH" -> writePath))) val newRecords = pipedRdd.rdd.count @@ -1059,7 +1059,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { implicit val tFormatter = SAMInFormatter implicit val uFormatter = new VCFOutFormatter(sc.hadoopConfiguration) - val pipedRdd: VariantContextRDD = ardd.pipe[VariantContext, VariantContextProduct, VariantContextRDD, SAMInFormatter](Seq("/bin/bash", "$0", tempPath, "$1"), + val pipedRdd: VariantContextDataset = ardd.pipe[VariantContext, VariantContextProduct, VariantContextDataset, SAMInFormatter](Seq("/bin/bash", "$0", tempPath, "$1"), files = Seq(scriptPath, vcfPath)) val newRecords = pipedRdd.rdd.count assert(newRecords === 6) @@ -1142,7 +1142,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sd = 
SequenceDictionary(SequenceRecord("chr1", 51L), SequenceRecord("chr2", 51L)) - val reads = RDDBoundAlignmentRecordRDD(sc.parallelize(Seq(makeReadAndRegion(0, "chr1", 10L, 20L), + val reads = RDDBoundAlignmentRecordDataset(sc.parallelize(Seq(makeReadAndRegion(0, "chr1", 10L, 20L), makeReadAndRegion(1, "chr1", 40L, 50L), makeReadAndRegion(1, "chr2", 10L, 20L), makeReadAndRegion(1, "chr2", 20L, 30L), @@ -1157,7 +1157,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { Some(ReferenceRegion("chr1", 40L, 50L), ReferenceRegion("chr2", 20L, 30L)), Some(ReferenceRegion("chr2", 40L, 50L), ReferenceRegion("chr2", 40L, 50L))))) - val features = FeatureRDD(sc.parallelize(Seq(Feature.newBuilder + val features = FeatureDataset(sc.parallelize(Seq(Feature.newBuilder .setContigName("chr2") .setStart(20L) .setEnd(50L) @@ -1191,8 +1191,8 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { assert(jRdd.dataset.count === 5) assert(jRdd0.rdd.count === 5) - val joinedReads: AlignmentRecordRDD = jRdd - .transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]((rdd: RDD[(AlignmentRecord, Feature)]) => { + val joinedReads: AlignmentRecordDataset = jRdd + .transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]((rdd: RDD[(AlignmentRecord, Feature)]) => { rdd.map(_._1) }) val tempPath = tmpLocation(".sam") @@ -1438,19 +1438,19 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { assert(kmerCounts.toDF().where($"kmer" === "CCAAGA" && $"count" === 3).count === 1) } - sparkTest("transform reads to contig rdd") { + sparkTest("transform reads to contig genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(ncRdd: NucleotideContigFragmentRDD) { + def checkSave(ncRdd: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".fa") ncRdd.saveAsFasta(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count.toInt === 20) } - val features: NucleotideContigFragmentRDD = reads.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val features: NucleotideContigFragmentDataset = reads.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordRDDSuite.ncfFn) + rdd.map(AlignmentRecordDatasetSuite.ncfFn) }) checkSave(features) @@ -1458,30 +1458,30 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: NucleotideContigFragmentRDD = reads.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val featuresDs: NucleotideContigFragmentDataset = reads.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { NucleotideContigFragmentProduct.fromAvro( - AlignmentRecordRDDSuite.ncfFn(r.toAvro)) + AlignmentRecordDatasetSuite.ncfFn(r.toAvro)) }) }) checkSave(featuresDs) } - sparkTest("transform reads to coverage rdd") { + sparkTest("transform reads to coverage genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 20) } - val coverage: CoverageRDD = reads.transmute[Coverage, Coverage, CoverageRDD]( + val coverage: CoverageDataset = 
reads.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordRDDSuite.covFn) + rdd.map(AlignmentRecordDatasetSuite.covFn) }) checkSave(coverage) @@ -1489,27 +1489,27 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val coverageDs: CoverageRDD = reads.transmuteDataset[Coverage, Coverage, CoverageRDD]( + val coverageDs: CoverageDataset = reads.transmuteDataset[Coverage, Coverage, CoverageDataset]( (ds: Dataset[AlignmentRecordProduct]) => { - ds.map(r => AlignmentRecordRDDSuite.covFn(r.toAvro)) + ds.map(r => AlignmentRecordDatasetSuite.covFn(r.toAvro)) }) checkSave(coverageDs) } - sparkTest("transform reads to feature rdd") { + sparkTest("transform reads to feature genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.saveAsBed(tempPath) assert(sc.loadFeatures(tempPath).rdd.count === 20) } - val features: FeatureRDD = reads.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = reads.transmute[Feature, FeatureProduct, FeatureDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordRDDSuite.featFn) + rdd.map(AlignmentRecordDatasetSuite.featFn) }) checkSave(features) @@ -1517,30 +1517,30 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: FeatureRDD = reads.transmuteDataset[Feature, FeatureProduct, FeatureRDD]( + val featuresDs: FeatureDataset = reads.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { FeatureProduct.fromAvro( - AlignmentRecordRDDSuite.featFn(r.toAvro)) + AlignmentRecordDatasetSuite.featFn(r.toAvro)) }) }) checkSave(featuresDs) } - sparkTest("transform reads to fragment rdd") { + sparkTest("transform reads to fragment genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 20) } - val fragments: FragmentRDD = reads.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = reads.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordRDDSuite.fragFn) + rdd.map(AlignmentRecordDatasetSuite.fragFn) }) checkSave(fragments) @@ -1548,30 +1548,30 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val fragmentsDs: FragmentRDD = reads.transmuteDataset[Fragment, FragmentProduct, FragmentRDD]( + val fragmentsDs: FragmentDataset = reads.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { FragmentProduct.fromAvro( - AlignmentRecordRDDSuite.fragFn(r.toAvro)) + AlignmentRecordDatasetSuite.fragFn(r.toAvro)) }) }) checkSave(fragmentsDs) } - sparkTest("transform reads to genotype rdd") { + sparkTest("transform reads to genotype genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) 
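// The transmute tests in this hunk all share one shape after the rename: an RDD-level
// function converts the source records to the target Avro type, and the type parameters
// pick the target GenomicDataset, which carries the source's sequence dictionary across.
// A minimal, hedged sketch of that pattern using the suite's covFn helper; it assumes an
// ADAMContext-enabled SparkContext `sc` and an illustrative input path.
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.Coverage
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.feature.CoverageDataset
import org.bdgenomics.adam.rdd.read.AlignmentRecordDatasetSuite
import org.bdgenomics.formats.avro.AlignmentRecord

val readsToConvert = sc.loadAlignments("small.sam")
// Convert every read to a Coverage record and rewrap the result as a CoverageDataset.
val readCoverage: CoverageDataset =
  readsToConvert.transmute[Coverage, Coverage, CoverageDataset](
    (rdd: RDD[AlignmentRecord]) => rdd.map(AlignmentRecordDatasetSuite.covFn))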
assert(sc.loadGenotypes(tempPath).rdd.count === 20) } - val genotypes: GenotypeRDD = reads.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = reads.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordRDDSuite.genFn) + rdd.map(AlignmentRecordDatasetSuite.genFn) }) checkSave(genotypes) @@ -1579,30 +1579,30 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val genotypesDs: GenotypeRDD = reads.transmuteDataset[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypesDs: GenotypeDataset = reads.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { GenotypeProduct.fromAvro( - AlignmentRecordRDDSuite.genFn(r.toAvro)) + AlignmentRecordDatasetSuite.genFn(r.toAvro)) }) }) checkSave(genotypesDs) } - sparkTest("transform reads to variant rdd") { + sparkTest("transform reads to variant genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 20) } - val variants: VariantRDD = reads.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = reads.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordRDDSuite.varFn) + rdd.map(AlignmentRecordDatasetSuite.varFn) }) checkSave(variants) @@ -1610,11 +1610,11 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val variantsDs: VariantRDD = reads.transmuteDataset[Variant, VariantProduct, VariantRDD]( + val variantsDs: VariantDataset = reads.transmuteDataset[Variant, VariantProduct, VariantDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { VariantProduct.fromAvro( - AlignmentRecordRDDSuite.varFn(r.toAvro)) + AlignmentRecordDatasetSuite.varFn(r.toAvro)) }) }) @@ -1623,12 +1623,12 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { test("cannot have a null processing step ID") { intercept[IllegalArgumentException] { - AlignmentRecordRDD.processingStepToSam(ProcessingStep.newBuilder.build) + AlignmentRecordDataset.processingStepToSam(ProcessingStep.newBuilder.build) } } test("convert a processing description to htsjdk") { - val htsjdkPg = AlignmentRecordRDD.processingStepToSam( + val htsjdkPg = AlignmentRecordDataset.processingStepToSam( ProcessingStep.newBuilder() .setId("pg") .setProgramName("myProgram") @@ -1643,18 +1643,18 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { assert(htsjdkPg.getPreviousProgramGroupId === "ppg") } - sparkTest("GenomicRDD.sort does not fail on unmapped reads") { + sparkTest("GenomicDataset.sort does not fail on unmapped reads") { val inputPath = testFile("unmapped.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) assert(reads.rdd.count === 200) val sorted = reads.sort(stringency = ValidationStringency.SILENT) assert(sorted.rdd.count === 102) } - sparkTest("GenomicRDD.sortLexicographically does not fail on unmapped reads") { + sparkTest("GenomicDataset.sortLexicographically does not fail on unmapped reads") { val inputPath = testFile("unmapped.sam") - val reads: AlignmentRecordRDD = sc.loadAlignments(inputPath) + val 
reads: AlignmentRecordDataset = sc.loadAlignments(inputPath) assert(reads.rdd.count === 200) val sorted = reads.sortLexicographically( @@ -1690,7 +1690,7 @@ class AlignmentRecordRDDSuite extends ADAMFunSuite { .build()) // obviously, this isn't unaligned, but, we don't use the metadata here - val rdd = AlignmentRecordRDD.unaligned(sc.parallelize(reads)) + val rdd = AlignmentRecordDataset.unaligned(sc.parallelize(reads)) .leftNormalizeIndels() val normalized = rdd.rdd.collect diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala index bd6c19b411..d8db373be0 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MarkDuplicatesSuite.scala @@ -98,7 +98,7 @@ class MarkDuplicatesSuite extends ADAMFunSuite { } private def markDuplicates(reads: AlignmentRecord*): Array[AlignmentRecord] = { - AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, rgd, Seq.empty) + AlignmentRecordDataset(sc.parallelize(reads), SequenceDictionary.empty, rgd, Seq.empty) .markDuplicates() .rdd .collect() @@ -207,7 +207,7 @@ class MarkDuplicatesSuite extends ADAMFunSuite { } private def markDuplicateFragments(reads: AlignmentRecord*): Array[AlignmentRecord] = { - AlignmentRecordRDD(sc.parallelize(reads), SequenceDictionary.empty, rgd, Seq.empty) + AlignmentRecordDataset(sc.parallelize(reads), SequenceDictionary.empty, rgd, Seq.empty) .toFragments .markDuplicates() .toReads diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndelsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndelsSuite.scala index 5d8d7138d3..a6caf2c2b8 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndelsSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndelsSuite.scala @@ -32,15 +32,15 @@ import org.bdgenomics.adam.models.{ SequenceRecord } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD -import org.bdgenomics.adam.rdd.variant.VariantRDD +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.variant.VariantDataset import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.adam.util.{ ADAMFunSuite, ReferenceFile } import org.bdgenomics.formats.avro.{ AlignmentRecord, Contig, Variant } class RealignIndelsSuite extends ADAMFunSuite { - def artificialReadsRdd: AlignmentRecordRDD = { + def artificialReadsRdd: AlignmentRecordDataset = { val path = testFile("artificial.sam") sc.loadAlignments(path) } @@ -225,7 +225,7 @@ class RealignIndelsSuite extends ADAMFunSuite { .setReferenceAllele("AGGGGGGGGGG") .setAlternateAllele("A") .build - val variantRdd = VariantRDD(sc.parallelize(Seq(indel)), + val variantRdd = VariantDataset(sc.parallelize(Seq(indel)), artificialReadsRdd.sequences, DefaultHeaderLines.allHeaderLines) val knowns = ConsensusGenerator.fromKnownIndels(variantRdd) val artificialRealignedReadsCollected = artificialRealignedReads(cg = knowns) @@ -253,7 +253,7 @@ class RealignIndelsSuite extends ADAMFunSuite { .setReferenceAllele("AGGGGGGGGGG") .setAlternateAllele("A") .build - val variantRdd = VariantRDD(sc.parallelize(Seq(indel)), + val variantRdd = VariantDataset(sc.parallelize(Seq(indel)), artificialReadsRdd.sequences, DefaultHeaderLines.allHeaderLines) val knowns 
= ConsensusGenerator.fromKnownIndels(variantRdd) val union = ConsensusGenerator.union(knowns, ConsensusGenerator.fromReads) @@ -541,7 +541,7 @@ class RealignIndelsSuite extends ADAMFunSuite { .setMapq(45) .build - val rdd = AlignmentRecordRDD(sc.parallelize(Seq(insRead, + val rdd = AlignmentRecordDataset(sc.parallelize(Seq(insRead, extRead, ovlRead, ovsRead, diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala similarity index 86% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala index 83bb34d6c7..f6764d957e 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala @@ -31,10 +31,10 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, @@ -47,7 +47,7 @@ import org.bdgenomics.adam.sql.{ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ -object GenotypeRDDSuite extends Serializable { +object GenotypeDatasetSuite extends Serializable { def covFn(g: Genotype): Coverage = { Coverage(g.getContigName, @@ -101,11 +101,11 @@ object GenotypeRDDSuite extends Serializable { } } -class GenotypeRDDSuite extends ADAMFunSuite { +class GenotypeDatasetSuite extends ADAMFunSuite { val tempDir = Files.createTempDir() - sparkTest("union two genotype rdds together") { + sparkTest("union two genotype genomic datasets together") { val genotype1 = sc.loadGenotypes(testFile("gvcf_dir/gvcf_multiallelic.g.vcf")) val genotype2 = sc.loadGenotypes(testFile("small.vcf")) val union = genotype1.union(genotype2) @@ -188,8 +188,8 @@ class GenotypeRDDSuite extends ADAMFunSuite { assert(jRdd.rdd.count === 9L) assert(jRdd0.rdd.count === 9L) - val joinedGenotypes: GenotypeRDD = jRdd - .transmute[Genotype, GenotypeProduct, GenotypeRDD]((rdd: RDD[(Genotype, Feature)]) => { + val joinedGenotypes: GenotypeDataset = jRdd + .transmute[Genotype, GenotypeProduct, GenotypeDataset]((rdd: RDD[(Genotype, Feature)]) => { rdd.map(_._1) }) val tempPath = tmpLocation(".adam") @@ -351,14 +351,14 @@ class GenotypeRDDSuite extends ADAMFunSuite { } sparkTest("load parquet to sql, save, re-read from avro") { - def testMetadata(gRdd: GenotypeRDD) { - val sequenceRdd = gRdd.addSequence(SequenceRecord("aSequence", 1000L)) + def testMetadata(gDataset: GenotypeDataset) { + val sequenceRdd = gDataset.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - val headerRdd = gRdd.addHeaderLine(new VCFHeaderLine("ABC", "123")) + val headerRdd = gDataset.addHeaderLine(new VCFHeaderLine("ABC", "123")) assert(headerRdd.headerLines.exists(_.getKey == 
"ABC")) - val sampleRdd = gRdd.addSample(Sample.newBuilder + val sampleRdd = gDataset.addSample(Sample.newBuilder .setSampleId("aSample") .build) assert(sampleRdd.samples.exists(_.getSampleId == "aSample")) @@ -385,19 +385,19 @@ class GenotypeRDDSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 18) } - sparkTest("transform genotypes to contig rdd") { + sparkTest("transform genotypes to contig genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentRDD) { + def checkSave(contigs: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".adam") contigs.saveAsParquet(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count === 18) } - val contigs: NucleotideContigFragmentRDD = genotypes.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigs: NucleotideContigFragmentDataset = genotypes.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.ncfFn) + rdd.map(GenotypeDatasetSuite.ncfFn) }) checkSave(contigs) @@ -405,30 +405,30 @@ class GenotypeRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentRDD = genotypes.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigsDs: NucleotideContigFragmentDataset = genotypes.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { NucleotideContigFragmentProduct.fromAvro( - GenotypeRDDSuite.ncfFn(r.toAvro)) + GenotypeDatasetSuite.ncfFn(r.toAvro)) }) }) checkSave(contigsDs) } - sparkTest("transform genotypes to coverage rdd") { + sparkTest("transform genotypes to coverage genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 18) } - val coverage: CoverageRDD = genotypes.transmute[Coverage, Coverage, CoverageRDD]( + val coverage: CoverageDataset = genotypes.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.covFn) + rdd.map(GenotypeDatasetSuite.covFn) }) checkSave(coverage) @@ -436,27 +436,27 @@ class GenotypeRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val coverageDs: CoverageRDD = genotypes.transmuteDataset[Coverage, Coverage, CoverageRDD]( + val coverageDs: CoverageDataset = genotypes.transmuteDataset[Coverage, Coverage, CoverageDataset]( (ds: Dataset[GenotypeProduct]) => { - ds.map(r => GenotypeRDDSuite.covFn(r.toAvro)) + ds.map(r => GenotypeDatasetSuite.covFn(r.toAvro)) }) checkSave(coverageDs) } - sparkTest("transform genotypes to feature rdd") { + sparkTest("transform genotypes to feature genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.save(tempPath, false, false) assert(sc.loadFeatures(tempPath).rdd.count === 18) } - val features: FeatureRDD = genotypes.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = genotypes.transmute[Feature, 
FeatureProduct, FeatureDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.featFn) + rdd.map(GenotypeDatasetSuite.featFn) }) checkSave(features) @@ -464,30 +464,30 @@ class GenotypeRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featureDs: FeatureRDD = genotypes.transmuteDataset[Feature, FeatureProduct, FeatureRDD]( + val featureDs: FeatureDataset = genotypes.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { FeatureProduct.fromAvro( - GenotypeRDDSuite.featFn(r.toAvro)) + GenotypeDatasetSuite.featFn(r.toAvro)) }) }) checkSave(featureDs) } - sparkTest("transform genotypes to fragment rdd") { + sparkTest("transform genotypes to fragment genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 18) } - val fragments: FragmentRDD = genotypes.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = genotypes.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.fragFn) + rdd.map(GenotypeDatasetSuite.fragFn) }) checkSave(fragments) @@ -495,30 +495,30 @@ class GenotypeRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val fragmentsDs: FragmentRDD = genotypes.transmuteDataset[Fragment, FragmentProduct, FragmentRDD]( + val fragmentsDs: FragmentDataset = genotypes.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { FragmentProduct.fromAvro( - GenotypeRDDSuite.fragFn(r.toAvro)) + GenotypeDatasetSuite.fragFn(r.toAvro)) }) }) checkSave(fragmentsDs) } - sparkTest("transform genotypes to read rdd") { + sparkTest("transform genotypes to read genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".adam") reads.saveAsParquet(tempPath) assert(sc.loadAlignments(tempPath).rdd.count === 18) } - val reads: AlignmentRecordRDD = genotypes.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = genotypes.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.readFn) + rdd.map(GenotypeDatasetSuite.readFn) }) checkSave(reads) @@ -526,30 +526,30 @@ class GenotypeRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val readsDs: AlignmentRecordRDD = genotypes.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val readsDs: AlignmentRecordDataset = genotypes.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { AlignmentRecordProduct.fromAvro( - GenotypeRDDSuite.readFn(r.toAvro)) + GenotypeDatasetSuite.readFn(r.toAvro)) }) }) checkSave(readsDs) } - sparkTest("transform genotypes to variant rdd") { + sparkTest("transform genotypes to variant genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") 
variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 18) } - val variants: VariantRDD = genotypes.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = genotypes.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.varFn) + rdd.map(GenotypeDatasetSuite.varFn) }) checkSave(variants) @@ -557,27 +557,27 @@ class GenotypeRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val variantsDs: VariantRDD = genotypes.transmuteDataset[Variant, VariantProduct, VariantRDD]( + val variantsDs: VariantDataset = genotypes.transmuteDataset[Variant, VariantProduct, VariantDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { VariantProduct.fromAvro( - GenotypeRDDSuite.varFn(r.toAvro)) + GenotypeDatasetSuite.varFn(r.toAvro)) }) }) checkSave(variantsDs) } - sparkTest("transform genotypes to variant context rdd") { + sparkTest("transform genotypes to variant context genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(variantContexts: VariantContextRDD) { + def checkSave(variantContexts: VariantContextDataset) { assert(variantContexts.rdd.count === 18) } - val variantContexts: VariantContextRDD = genotypes.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val variantContexts: VariantContextDataset = genotypes.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeRDDSuite.vcFn) + rdd.map(GenotypeDatasetSuite.vcFn) }) checkSave(variantContexts) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala similarity index 83% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala index efded5f775..ace1794c94 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala @@ -38,10 +38,10 @@ import org.bdgenomics.adam.models.{ } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.TestSaveArgs -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, @@ -55,11 +55,11 @@ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ import scala.collection.JavaConversions._ -class VariantContextRDDSuite extends ADAMFunSuite { +class VariantContextDatasetSuite extends ADAMFunSuite { val tempDir = Files.createTempDir() - def variants: VariantContextRDD = { + def variants: VariantContextDataset = { val contig = Contig.newBuilder.setContigName("chr11") .setContigLength(249250621L) .build @@ -79,7 +79,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { 
.setAlleles(List(GenotypeAllele.REF, GenotypeAllele.ALT)) .build - VariantContextRDD(sc.parallelize(List( + VariantContextDataset(sc.parallelize(List( VariantContext(v0, Seq(g0))), 1), SequenceDictionary.fromAvro(Seq(contig)), Seq(Sample.newBuilder() .setSampleId("NA12878") @@ -99,7 +99,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { assert(vc1.rdd.count === 6) } - sparkTest("union two variant context rdds together") { + sparkTest("union two variant context genomic datasets together") { val vc1 = sc.loadVcf(testFile("gvcf_dir/gvcf_multiallelic.g.vcf")) val vc2 = sc.loadVcf(testFile("small.vcf")) val union = vc1.union(vc2) @@ -217,13 +217,13 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("don't lose any variants when piping as VCF") { val smallVcf = testFile("small.vcf") - val rdd: VariantContextRDD = sc.loadVcf(smallVcf) + val rdd: VariantContextDataset = sc.loadVcf(smallVcf) val records = rdd.rdd.count implicit val tFormatter = VCFInFormatter implicit val uFormatter = new VCFOutFormatter(sc.hadoopConfiguration) - val pipedRdd: VariantContextRDD = rdd.pipe[VariantContext, VariantContextProduct, VariantContextRDD, VCFInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: VariantContextDataset = rdd.pipe[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter](Seq("tee", "/dev/null")) .transform(_.cache()) val newRecords = pipedRdd.rdd.count assert(records === newRecords) @@ -232,13 +232,13 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("pipe works with empty partitions") { val smallVcf = testFile("small.addctg.vcf") - val rdd: VariantContextRDD = sc.loadVcf(smallVcf) + val rdd: VariantContextDataset = sc.loadVcf(smallVcf) val records = rdd.rdd.count implicit val tFormatter = VCFInFormatter implicit val uFormatter = new VCFOutFormatter(sc.hadoopConfiguration) - val pipedRdd: VariantContextRDD = rdd.pipe[VariantContext, VariantContextProduct, VariantContextRDD, VCFInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: VariantContextDataset = rdd.pipe[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter](Seq("tee", "/dev/null")) .transform(_.cache()) val newRecords = pipedRdd.rdd.count assert(records === newRecords) @@ -247,14 +247,14 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("don't lose any non-default VCF header lines or attributes when piping as VCF") { val freebayesVcf = testFile("NA12878.chr22.tiny.freebayes.vcf") - val rdd: VariantContextRDD = sc.loadVcf(freebayesVcf) + val rdd: VariantContextDataset = sc.loadVcf(freebayesVcf) val accumulator: CollectionAccumulator[VCFHeaderLine] = sc.collectionAccumulator("headerLines") implicit val tFormatter = VCFInFormatter implicit val uFormatter = new VCFOutFormatter(sc.hadoopConfiguration, Some(accumulator)) - val pipedRdd: VariantContextRDD = rdd.pipe[VariantContext, VariantContextProduct, VariantContextRDD, VCFInFormatter](Seq("tee", "/dev/null")) + val pipedRdd: VariantContextDataset = rdd.pipe[VariantContext, VariantContextProduct, VariantContextDataset, VCFInFormatter](Seq("tee", "/dev/null")) // check for freebayes-specific VCF INFO keys val variant = pipedRdd.toVariants.rdd.first @@ -336,7 +336,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { } sparkTest("test metadata") { - def testMetadata(vRdd: VariantContextRDD) { + def testMetadata(vRdd: VariantContextDataset) { val sequenceRdd = vRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) @@ -354,7 +354,7 @@ 
class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("save sharded bgzip vcf") { val smallVcf = testFile("bqsr1.vcf") - val rdd: VariantContextRDD = sc.loadVcf(smallVcf) + val rdd: VariantContextDataset = sc.loadVcf(smallVcf) val outputPath = tmpFile("bqsr1.vcf.bgz") rdd.transform(_.repartition(4)).saveAsVcf(outputPath, asSingleFile = false, @@ -367,7 +367,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("save bgzip vcf as single file") { val smallVcf = testFile("small.vcf") - val rdd: VariantContextRDD = sc.loadVcf(smallVcf) + val rdd: VariantContextDataset = sc.loadVcf(smallVcf) val outputPath = tmpFile("small.vcf.bgz") rdd.saveAsVcf(outputPath, asSingleFile = true, @@ -380,7 +380,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { sparkTest("can't save file with non-vcf extension") { val smallVcf = testFile("small.vcf") - val rdd: VariantContextRDD = sc.loadVcf(smallVcf) + val rdd: VariantContextDataset = sc.loadVcf(smallVcf) intercept[IllegalArgumentException] { rdd.saveAsVcf("small.bcf", @@ -391,125 +391,125 @@ class VariantContextRDDSuite extends ADAMFunSuite { } } - sparkTest("transform variant contexts to contig rdd") { + sparkTest("transform variant contexts to contig genomic dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentRDD) { + def checkSave(contigs: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".adam") contigs.saveAsParquet(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count === 6) } - val contigs: NucleotideContigFragmentRDD = variantContexts.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigs: NucleotideContigFragmentDataset = variantContexts.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantRDDSuite.ncfFn) + rdd.map(VariantDatasetSuite.ncfFn) }) checkSave(contigs) } - sparkTest("transform variant contexts to coverage rdd") { + sparkTest("transform variant contexts to coverage genomic dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 6) } - val coverage: CoverageRDD = variantContexts.transmute[Coverage, Coverage, CoverageRDD]( + val coverage: CoverageDataset = variantContexts.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantRDDSuite.covFn) + rdd.map(VariantDatasetSuite.covFn) }) checkSave(coverage) } - sparkTest("transform variant contexts to feature rdd") { + sparkTest("transform variant contexts to feature genomic dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.save(tempPath, false, false) assert(sc.loadFeatures(tempPath).rdd.count === 6) } - val features: FeatureRDD = variantContexts.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = variantContexts.transmute[Feature, FeatureProduct, FeatureDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantRDDSuite.featFn) + rdd.map(VariantDatasetSuite.featFn) }) checkSave(features) } - sparkTest("transform variant contexts to fragment rdd") { + sparkTest("transform variant contexts to fragment genomic 
dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 6) } - val fragments: FragmentRDD = variantContexts.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = variantContexts.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantRDDSuite.fragFn) + rdd.map(VariantDatasetSuite.fragFn) }) checkSave(fragments) } - sparkTest("transform variant contexts to read rdd") { + sparkTest("transform variant contexts to read genomic dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".adam") reads.saveAsParquet(tempPath) assert(sc.loadAlignments(tempPath).rdd.count === 6) } - val reads: AlignmentRecordRDD = variantContexts.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = variantContexts.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantRDDSuite.readFn) + rdd.map(VariantDatasetSuite.readFn) }) checkSave(reads) } - sparkTest("transform variant contexts to genotype rdd") { + sparkTest("transform variant contexts to genotype genomic dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) assert(sc.loadGenotypes(tempPath).rdd.count === 6) } - val genotypes: GenotypeRDD = variantContexts.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = variantContexts.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantRDDSuite.genFn) + rdd.map(VariantDatasetSuite.genFn) }) checkSave(genotypes) } - sparkTest("transform variant contexts to variant rdd") { + sparkTest("transform variant contexts to variant genomic dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(variants: VariantRDD) { + def checkSave(variants: VariantDataset) { val tempPath = tmpLocation(".adam") variants.saveAsParquet(tempPath) assert(sc.loadVariants(tempPath).rdd.count === 6) } - val variants: VariantRDD = variantContexts.transmute[Variant, VariantProduct, VariantRDD]( + val variants: VariantDataset = variantContexts.transmute[Variant, VariantProduct, VariantDataset]( (rdd: RDD[VariantContext]) => { rdd.map(_.variant.variant) }) @@ -518,7 +518,7 @@ class VariantContextRDDSuite extends ADAMFunSuite { } sparkTest("save and reload from partitioned parquet") { - def testMetadata(vcs: VariantContextRDD) { + def testMetadata(vcs: VariantContextDataset) { assert(vcs.sequences.containsReferenceName("13")) assert(vcs.samples.isEmpty) assert(vcs.headerLines.exists(_.getKey == "GATKCommandLine")) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala similarity index 85% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantRDDSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala index e48138f5f3..63513303ed 100644 --- 
a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala @@ -27,10 +27,10 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD -import org.bdgenomics.adam.rdd.feature.{ CoverageRDD, FeatureRDD } -import org.bdgenomics.adam.rdd.fragment.FragmentRDD -import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD +import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset +import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } +import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, @@ -43,7 +43,7 @@ import org.bdgenomics.adam.sql.{ import org.bdgenomics.adam.util.ADAMFunSuite import org.bdgenomics.formats.avro._ -object VariantRDDSuite extends Serializable { +object VariantDatasetSuite extends Serializable { def covFn(v: Variant): Coverage = { Coverage(v.getContigName, @@ -121,9 +121,9 @@ object VariantRDDSuite extends Serializable { } } -class VariantRDDSuite extends ADAMFunSuite { +class VariantDatasetSuite extends ADAMFunSuite { - sparkTest("union two variant rdds together") { + sparkTest("union two variant genomic datasets together") { val variant1 = sc.loadVariants(testFile("gvcf_dir/gvcf_multiallelic.g.vcf")) val variant2 = sc.loadVariants(testFile("small.vcf")) val union = variant1.union(variant2) @@ -209,8 +209,8 @@ class VariantRDDSuite extends ADAMFunSuite { assert(jRdd.rdd.count === 3L) assert(jRdd0.rdd.count === 3L) - val joinedVariants: VariantRDD = jRdd - .transmute[Variant, VariantProduct, VariantRDD]((rdd: RDD[(Variant, Feature)]) => { + val joinedVariants: VariantDataset = jRdd + .transmute[Variant, VariantProduct, VariantDataset]((rdd: RDD[(Variant, Feature)]) => { rdd.map(_._1) }) val tempPath = tmpLocation(".adam") @@ -372,7 +372,7 @@ class VariantRDDSuite extends ADAMFunSuite { } sparkTest("load parquet to sql, save, re-read from avro") { - def testMetadata(vRdd: VariantRDD) { + def testMetadata(vRdd: VariantDataset) { val sequenceRdd = vRdd.addSequence(SequenceRecord("aSequence", 1000L)) assert(sequenceRdd.sequences.containsReferenceName("aSequence")) @@ -401,19 +401,19 @@ class VariantRDDSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 6) } - sparkTest("transform variants to contig rdd") { + sparkTest("transform variants to contig genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentRDD) { + def checkSave(contigs: NucleotideContigFragmentDataset) { val tempPath = tmpLocation(".adam") contigs.saveAsParquet(tempPath) assert(sc.loadContigFragments(tempPath).rdd.count === 6) } - val contigs: NucleotideContigFragmentRDD = variants.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigs: NucleotideContigFragmentDataset = variants.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.ncfFn) + rdd.map(VariantDatasetSuite.ncfFn) }) checkSave(contigs) @@ -421,30 +421,30 @@ class VariantRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentRDD = 
variants.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentRDD]( + val contigsDs: NucleotideContigFragmentDataset = variants.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { NucleotideContigFragmentProduct.fromAvro( - VariantRDDSuite.ncfFn(r.toAvro)) + VariantDatasetSuite.ncfFn(r.toAvro)) }) }) checkSave(contigsDs) } - sparkTest("transform variants to coverage rdd") { + sparkTest("transform variants to coverage genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(coverage: CoverageRDD) { + def checkSave(coverage: CoverageDataset) { val tempPath = tmpLocation(".bed") coverage.save(tempPath, false, false) assert(sc.loadCoverage(tempPath).rdd.count === 6) } - val coverage: CoverageRDD = variants.transmute[Coverage, Coverage, CoverageRDD]( + val coverage: CoverageDataset = variants.transmute[Coverage, Coverage, CoverageDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.covFn) + rdd.map(VariantDatasetSuite.covFn) }) checkSave(coverage) @@ -452,27 +452,27 @@ class VariantRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val coverageDs: CoverageRDD = variants.transmuteDataset[Coverage, Coverage, CoverageRDD]( + val coverageDs: CoverageDataset = variants.transmuteDataset[Coverage, Coverage, CoverageDataset]( (ds: Dataset[VariantProduct]) => { - ds.map(r => VariantRDDSuite.covFn(r.toAvro)) + ds.map(r => VariantDatasetSuite.covFn(r.toAvro)) }) checkSave(coverageDs) } - sparkTest("transform variants to feature rdd") { + sparkTest("transform variants to feature genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(features: FeatureRDD) { + def checkSave(features: FeatureDataset) { val tempPath = tmpLocation(".bed") features.save(tempPath, false, false) assert(sc.loadFeatures(tempPath).rdd.count === 6) } - val features: FeatureRDD = variants.transmute[Feature, FeatureProduct, FeatureRDD]( + val features: FeatureDataset = variants.transmute[Feature, FeatureProduct, FeatureDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.featFn) + rdd.map(VariantDatasetSuite.featFn) }) checkSave(features) @@ -480,30 +480,30 @@ class VariantRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featureDs: FeatureRDD = variants.transmuteDataset[Feature, FeatureProduct, FeatureRDD]( + val featureDs: FeatureDataset = variants.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { FeatureProduct.fromAvro( - VariantRDDSuite.featFn(r.toAvro)) + VariantDatasetSuite.featFn(r.toAvro)) }) }) checkSave(featureDs) } - sparkTest("transform variants to fragment rdd") { + sparkTest("transform variants to fragment genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(fragments: FragmentRDD) { + def checkSave(fragments: FragmentDataset) { val tempPath = tmpLocation(".adam") fragments.saveAsParquet(tempPath) assert(sc.loadFragments(tempPath).rdd.count === 6) } - val fragments: FragmentRDD = variants.transmute[Fragment, FragmentProduct, FragmentRDD]( + val fragments: FragmentDataset = variants.transmute[Fragment, FragmentProduct, FragmentDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.fragFn) + rdd.map(VariantDatasetSuite.fragFn) }) checkSave(fragments) @@ -511,30 +511,30 @@ 
class VariantRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val fragmentsDs: FragmentRDD = variants.transmuteDataset[Fragment, FragmentProduct, FragmentRDD]( + val fragmentsDs: FragmentDataset = variants.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { FragmentProduct.fromAvro( - VariantRDDSuite.fragFn(r.toAvro)) + VariantDatasetSuite.fragFn(r.toAvro)) }) }) checkSave(fragmentsDs) } - sparkTest("transform variants to read rdd") { + sparkTest("transform variants to read genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(reads: AlignmentRecordRDD) { + def checkSave(reads: AlignmentRecordDataset) { val tempPath = tmpLocation(".adam") reads.saveAsParquet(tempPath) assert(sc.loadAlignments(tempPath).rdd.count === 6) } - val reads: AlignmentRecordRDD = variants.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val reads: AlignmentRecordDataset = variants.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.readFn) + rdd.map(VariantDatasetSuite.readFn) }) checkSave(reads) @@ -542,30 +542,30 @@ class VariantRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val readsDs: AlignmentRecordRDD = variants.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordRDD]( + val readsDs: AlignmentRecordDataset = variants.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { AlignmentRecordProduct.fromAvro( - VariantRDDSuite.readFn(r.toAvro)) + VariantDatasetSuite.readFn(r.toAvro)) }) }) checkSave(readsDs) } - sparkTest("transform variants to genotype rdd") { + sparkTest("transform variants to genotype genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(genotypes: GenotypeRDD) { + def checkSave(genotypes: GenotypeDataset) { val tempPath = tmpLocation(".adam") genotypes.saveAsParquet(tempPath) assert(sc.loadGenotypes(tempPath).rdd.count === 6) } - val genotypes: GenotypeRDD = variants.transmute[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypes: GenotypeDataset = variants.transmute[Genotype, GenotypeProduct, GenotypeDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.genFn) + rdd.map(VariantDatasetSuite.genFn) }) checkSave(genotypes) @@ -573,27 +573,27 @@ class VariantRDDSuite extends ADAMFunSuite { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val genotypesDs: GenotypeRDD = variants.transmuteDataset[Genotype, GenotypeProduct, GenotypeRDD]( + val genotypesDs: GenotypeDataset = variants.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { GenotypeProduct.fromAvro( - VariantRDDSuite.genFn(r.toAvro)) + VariantDatasetSuite.genFn(r.toAvro)) }) }) checkSave(genotypesDs) } - sparkTest("transform variants to variant context rdd") { + sparkTest("transform variants to variant context genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(variantContexts: VariantContextRDD) { + def checkSave(variantContexts: VariantContextDataset) { assert(variantContexts.rdd.count === 6) } - val variantContexts: VariantContextRDD = variants.transmute[VariantContext, VariantContextProduct, VariantContextRDD]( + val variantContexts: VariantContextDataset = 
variants.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantRDDSuite.vcFn) + rdd.map(VariantDatasetSuite.vcFn) }) checkSave(variantContexts) diff --git a/adam-python/bdgenomics/adam/adamContext.py b/adam-python/bdgenomics/adam/adamContext.py index 51c48fc655..454d154856 100644 --- a/adam-python/bdgenomics/adam/adamContext.py +++ b/adam-python/bdgenomics/adam/adamContext.py @@ -26,13 +26,13 @@ ADAMContext """ -from bdgenomics.adam.rdd import AlignmentRecordRDD, \ - CoverageRDD, \ - FeatureRDD, \ - FragmentRDD, \ - GenotypeRDD, \ - NucleotideContigFragmentRDD, \ - VariantRDD +from bdgenomics.adam.rdd import AlignmentRecordDataset, \ + CoverageDataset, \ + FeatureDataset, \ + FragmentDataset, \ + GenotypeDataset, \ + NucleotideContigFragmentDataset, \ + VariantDataset from bdgenomics.adam.stringency import STRICT, _toJava @@ -58,7 +58,7 @@ def __init__(self, ss): def loadAlignments(self, filePath, stringency=STRICT): """ - Load alignment records into an AlignmentRecordRDD. + Load alignment records into an AlignmentRecordDataset. Loads path names ending in: * .bam/.cram/.sam as BAM/CRAM/SAM format, @@ -74,14 +74,14 @@ def loadAlignments(self, filePath, stringency=STRICT): :param str filePath: The path to load the file from. :param stringency: The validation stringency to apply. Defaults to STRICT. - :return: Returns an RDD containing reads. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :return: Returns a genomic dataset containing reads. + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ adamRdd = self.__jac.loadAlignments(filePath, _toJava(stringency, self._jvm)) - return AlignmentRecordRDD(adamRdd, self._sc) + return AlignmentRecordDataset(adamRdd, self._sc) def loadIndexedBam(self, @@ -100,11 +100,11 @@ def loadIndexedBam(self, :param int stringency: The validation stringency to use when validating the BAM/CRAM/SAM format header. Defaults to ValidationStringency.STRICT. - :return Returns an AlignmentRecordRDD which wraps the RDD of alignment + :return Returns an AlignmentRecordDataset which wraps the RDD of alignment records, sequence dictionary representing contigs the alignment records may be aligned to, and the record group dictionary for the alignment records if one is available. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ # translate reference regions into jvm types @@ -113,14 +113,13 @@ def loadIndexedBam(self, adamRdd = self.__jac.loadIndexedBam(filePath, javaRrs, _toJava(stringency, self._jvm)) - - return AlignmentRecordRDD(adamRdd, self._sc) + return AlignmentRecordDataset(adamRdd, self._sc) def loadCoverage(self, filePath, stringency=STRICT): """ - Load features into a FeatureRDD and convert to a CoverageRDD. + Load features into a FeatureDataset and convert to a CoverageDataset. Coverage is stored in the score field of Feature. Loads path names ending in: @@ -138,19 +137,19 @@ def loadCoverage(self, filePath, :param str filePath: The path to load coverage data from. :param stringency: The validation stringency to apply. Defaults to STRICT. - :return: Returns an RDD containing coverage. - :rtype: bdgenomics.adam.rdd.CoverageRDD + :return: Returns a genomic dataset containing coverage. 
+ :rtype: bdgenomics.adam.rdd.CoverageDataset """ adamRdd = self.__jac.loadCoverage(filePath, _toJava(stringency, self._jvm)) - return CoverageRDD(adamRdd, self._sc) + return CoverageDataset(adamRdd, self._sc) def loadContigFragments(self, filePath): """ - Load nucleotide contig fragments into a NucleotideContigFragmentRDD. + Load nucleotide contig fragments into a NucleotideContigFragmentDataset. If the path name has a .fa/.fasta extension, load as FASTA format. Else, fall back to Parquet + Avro. @@ -159,18 +158,18 @@ def loadContigFragments(self, filePath): in Hadoop, which by default include .gz and .bz2, but can include more. :param str filePath: The path to load the file from. - :return: Returns an RDD containing sequence fragments. - :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentRDD + :return: Returns a genomic dataset containing sequence fragments. + :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentDataset """ adamRdd = self.__jac.loadContigFragments(filePath) - return NucleotideContigFragmentRDD(adamRdd, self._sc) + return NucleotideContigFragmentDataset(adamRdd, self._sc) def loadFragments(self, filePath, stringency=STRICT): """ - Load fragments into a FragmentRDD. + Load fragments into a FragmentDataset. Loads path names ending in: * .bam/.cram/.sam as BAM/CRAM/SAM format and @@ -182,18 +181,18 @@ def loadFragments(self, filePath, stringency=STRICT): :param str filePath: The path to load the file from. :param stringency: The validation stringency to apply. Defaults to STRICT. - :return: Returns an RDD containing sequenced fragments. - :rtype: bdgenomics.adam.rdd.FragmentRDD + :return: Returns a genomic dataset containing sequenced fragments. + :rtype: bdgenomics.adam.rdd.FragmentDataset """ adamRdd = self.__jac.loadFragments(filePath, stringency) - return FragmentRDD(adamRdd, self._sc) + return FragmentDataset(adamRdd, self._sc) def loadFeatures(self, filePath, stringency=STRICT): """ - Load features into a FeatureRDD. + Load features into a FeatureDataset. Loads path names ending in: * .bed as BED6/12 format, @@ -210,49 +209,49 @@ def loadFeatures(self, filePath, stringency=STRICT): :param str filePath: The path to load the file from. :param stringency: The validation stringency to apply. Defaults to STRICT. - :return: Returns an RDD containing features. - :rtype: bdgenomics.adam.rdd.FeatureRDD + :return: Returns a genomic dataset containing features. + :rtype: bdgenomics.adam.rdd.FeatureDataset """ adamRdd = self.__jac.loadFeatures(filePath, _toJava(stringency, self._jvm)) - return FeatureRDD(adamRdd, self._sc) + return FeatureDataset(adamRdd, self._sc) def loadGenotypes(self, filePath, stringency=STRICT): """ - Load genotypes into a GenotypeRDD. + Load genotypes into a GenotypeDataset. If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. Else, fall back to Parquet + Avro. :param str filePath: The path to load the file from. :param stringency: The validation stringency to apply. Defaults to STRICT. - :return: Returns an RDD containing genotypes. - :rtype: bdgenomics.adam.rdd.GenotypeRDD + :return: Returns a genomic dataset containing genotypes. + :rtype: bdgenomics.adam.rdd.GenotypeDataset """ adamRdd = self.__jac.loadGenotypes(filePath, _toJava(stringency, self._jvm)) - return GenotypeRDD(adamRdd, self._sc) + return GenotypeDataset(adamRdd, self._sc) def loadVariants(self, filePath, stringency=STRICT): """ - Load variants into a VariantRDD. + Load variants into a VariantDataset. 
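# Illustrative usage sketch (not part of the patch): exercising the renamed
# loader API from adamContext.py above. The SparkSession setup and the input
# paths ("sample.bed", "sample.vcf", "reads.bam") are hypothetical placeholders.
from pyspark.sql import SparkSession
from bdgenomics.adam.adamContext import ADAMContext

spark = SparkSession.builder.appName("adam-loaders").getOrCreate()
ac = ADAMContext(spark)

features = ac.loadFeatures("sample.bed")      # returns a FeatureDataset
coverage = ac.loadCoverage("sample.bed")      # returns a CoverageDataset
genotypes = ac.loadGenotypes("sample.vcf")    # returns a GenotypeDataset
reads = ac.loadAlignments("reads.bam")        # returns an AlignmentRecordDataset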
If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. Else, fall back to Parquet + Avro. :param str filePath: The path to load the file from. :param stringency: The validation stringency to apply. Defaults to STRICT. - :return: Returns an RDD containing variants. - :rtype: bdgenomics.adam.rdd.VariantRDD + :return: Returns a genomic dataset containing variants. + :rtype: bdgenomics.adam.rdd.VariantDataset """ adamRdd = self.__jac.loadVariants(filePath, _toJava(stringency, self._jvm)) - return VariantRDD(adamRdd, self._sc) + return VariantDataset(adamRdd, self._sc) diff --git a/adam-python/bdgenomics/adam/rdd.py b/adam-python/bdgenomics/adam/rdd.py index e99473a903..2fd27ef49d 100644 --- a/adam-python/bdgenomics/adam/rdd.py +++ b/adam-python/bdgenomics/adam/rdd.py @@ -106,7 +106,7 @@ def sort(self): Sorts our genome aligned data by reference positions, with contigs ordered by index. - :return: Returns a new, sorted RDD, of the implementing class type. + :return: Returns a new, sorted genomic dataset, of the implementing class type. """ return self._replaceRdd(self._jvmRdd.sort()) @@ -117,7 +117,7 @@ def sortLexicographically(self): Sorts our genome aligned data by reference positions, with contigs ordered lexicographically - :return: Returns a new, sorted RDD, of the implementing class type. + :return: Returns a new, sorted genomic dataset, of the implementing class type. """ return self._replaceRdd(self._jvmRdd.sortLexicographically()) @@ -156,7 +156,7 @@ def filterByOverlappingRegions(self, querys): def union(self, rdds): """ - Unions together multiple RDDs. + Unions together multiple genomic datasets. :param list rdds: The RDDs to union into this RDD. :return: Returns a new RDD containing the union of this RDD and the other RDDs. @@ -183,8 +183,8 @@ def transform(self, tFn): Applies a function that transforms the underlying DataFrame into a new DataFrame using the Spark SQL API. - :param function tFn: A function that transforms the underlying RDD as a DataFrame. - :return: A new RDD where the RDD of genomic data has been replaced, but the + :param function tFn: A function that transforms the underlying DataFrame as a DataFrame. + :return: A new genomic dataset where the DataFrame of genomic data has been replaced, but the metadata (sequence dictionary, and etc) is copied without modification. """ @@ -196,14 +196,14 @@ def transform(self, tFn): def transmute(self, tFn, destClass, convFn=None): """ - Applies a function that transmutes the underlying DataFrame into a new RDD of a + Applies a function that transmutes the underlying DataFrame into a new genomic dataset of a different type. - :param function tFn: A function that transforms the underlying RDD as a DataFrame. + :param function tFn: A function that transforms the underlying DataFrame as a DataFrame. :param str convFn: The name of the ADAM GenomicDatasetConversion class to use. :param class destClass: The destination class of this transmutation. - :return: A new RDD where the RDD of genomic data has been replaced, but the + :return: A new genomic dataset where the DataFrame of genomic data has been replaced, but the metadata (sequence dictionary, and etc) is copied without modification. 
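# Illustrative sketch (not part of the patch) of sort() and transform() on the
# renamed GenomicDataset wrapper: transform() takes a DataFrame-to-DataFrame
# function and leaves the metadata (sequence dictionary, samples, header lines)
# untouched. 'genotypes' is the GenotypeDataset from the loader sketch above,
# and the 'sampleId' column is an assumption about the genotype schema.
sortedGenotypes = genotypes.sort()

na12878 = genotypes.transform(
    lambda df: df.filter(df.sampleId == "NA12878"))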
""" @@ -228,19 +228,19 @@ def _inferConversionFn(self, destClass): def _destClassSuffix(self, destClass): - if destClass is NucleotideContigFragmentRDD: + if destClass is NucleotideContigFragmentDataset: return "ContigsDatasetConverter" - elif destClass is CoverageRDD: + elif destClass is CoverageDataset: return "CoverageDatasetConverter" - elif destClass is FeatureRDD: + elif destClass is FeatureDataset: return "FeaturesDatasetConverter" - elif destClass is FragmentRDD: + elif destClass is FragmentDataset: return "FragmentDatasetConverter" - elif destClass is AlignmentRecordRDD: + elif destClass is AlignmentRecordDataset: return "AlignmentRecordDatasetConverter" - elif destClass is GenotypeRDD: + elif destClass is GenotypeDataset: return "GenotypeDatasetConverter" - elif destClass is VariantRDD: + elif destClass is VariantDataset: return "VariantDatasetConverter" else: raise ValueError("No conversion method known for %s." % destClass) @@ -279,7 +279,7 @@ def pipe(self, executor. Set to None (default) to omit. :param int flankSize: The number of bases of flanking sequence to have around each partition. Defaults to 0. - :return: Returns a new RDD where the input from the original RDD has + :return: Returns a new genomic dataset where the input from the original genomic dataset has been piped through a command that runs locally on each executor. """ @@ -308,18 +308,18 @@ def pipe(self, def broadcastRegionJoin(self, genomicRdd, flankSize=0): """ - Performs a broadcast inner join between this RDD and another RDD. + Performs a broadcast inner join between this genomic dataset and another genomic dataset. - In a broadcast join, the left RDD (this RDD) is collected to the driver, + In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, and broadcast to all the nodes in the cluster. The key equality function used for this join is the reference region overlap function. Since this is an inner join, all values who do not overlap a value from the other - RDD are dropped. - - :param GenomicDataset genomicRdd: The right RDD in the join. + genomic dataset are dropped. + + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space. """ @@ -330,22 +330,22 @@ def broadcastRegionJoin(self, genomicRdd, flankSize=0): def rightOuterBroadcastRegionJoin(self, genomicRdd, flankSize=0): """ - Performs a broadcast right outer join between this RDD and another RDD. - - In a broadcast join, the left RDD (this RDD) is collected to the driver, + Performs a broadcast right outer join between this genomic dataset and another genomic dataset. + + In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, and broadcast to all the nodes in the cluster. The key equality function used for this join is the reference region overlap function. Since this - is a right outer join, all values in the left RDD that do not overlap a - value from the right RDD are dropped. If a value from the right RDD does - not overlap any values in the left RDD, it will be paired with a `None` + is a right outer join, all values in the left genomic dataset that do not overlap a + value from the right genomic dataset are dropped. 
If a value from the right genomic dataset does + not overlap any values in the left genomic dataset, it will be paired with a `None` in the product of the join. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, and all keys from the - right RDD that did not overlap a key in the left RDD. + right genomic dataset that did not overlap a key in the left genomic dataset. """ return GenomicDataset(self._jvmRdd.rightOuterBroadcastRegionJoin(genomicRdd._jvmRdd, @@ -355,18 +355,18 @@ def rightOuterBroadcastRegionJoin(self, genomicRdd, flankSize=0): def broadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0): """ - Performs a broadcast inner join between this RDD and another RDD. + Performs a broadcast inner join between this genomic dataset and another genomic dataset. - In a broadcast join, the left RDD (this RDD) is collected to the driver, + In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, and broadcast to all the nodes in the cluster. The key equality function used for this join is the reference region overlap function. Since this is an inner join, all values who do not overlap a value from the other - RDD are dropped. + genomic dataset are dropped. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space. """ @@ -377,22 +377,21 @@ def broadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0): def rightOuterBroadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0): """ - Performs a broadcast right outer join between this RDD and another RDD. - + Performs a broadcast right outer join between this genomic dataset and another genomic dataset. In a broadcast join, the left side of the join (broadcastTree) is broadcast to to all the nodes in the cluster. The key equality function used for this join is the reference region overlap function. Since this - is a right outer join, all values in the left RDD that do not overlap a - value from the right RDD are dropped. If a value from the right RDD does - not overlap any values in the left RDD, it will be paired with a `None` + is a right outer join, all values in the left genomic dataset that do not overlap a + value from the right genomic dataset are dropped. If a value from the right genomic dataset does + not overlap any values in the left genomic dataset, it will be paired with a `None` in the product of the join. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. 
- :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, and all keys from the - right RDD that did not overlap a key in the left RDD. + right genomic dataset that did not overlap a key in the left genomic dataset. """ return GenomicDataset(self._jvmRdd.rightOuterBroadcastRegionJoinAndGroupByRight(genomicRdd._jvmRdd, @@ -402,18 +401,18 @@ def rightOuterBroadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0): def shuffleRegionJoin(self, genomicRdd, flankSize=0): """ - Performs a sort-merge inner join between this RDD and another RDD. + Performs a sort-merge inner join between this genomic dataset and another genomic dataset. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. Since this is an inner join, all values who do not - overlap a value from the other RDD are dropped. + overlap a value from the other genomic dataset are dropped. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space. """ @@ -423,22 +422,22 @@ def shuffleRegionJoin(self, genomicRdd, flankSize=0): def rightOuterShuffleRegionJoin(self, genomicRdd, flankSize=0): """ - Performs a sort-merge right outer join between this RDD and another RDD. + Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. Since this is a right outer join, all values in the - left RDD that do not overlap a value from the right RDD are dropped. - If a value from the right RDD does not overlap any values in the left - RDD, it will be paired with a `None` in the product of the join. + left genomic dataset that do not overlap a value from the right genomic dataset are dropped. + If a value from the right genomic dataset does not overlap any values in the left + genomic dataset, it will be paired with a `None` in the product of the join. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, and all keys from the - right RDD that did not overlap a key in the left RDD. + right genomic dataset that did not overlap a key in the left genomic dataset. 
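# Illustrative sketch (not part of the patch) of the sort-merge (shuffle) joins
# described above. Unlike the broadcast joins, both inputs are co-partitioned
# and sorted, so neither side has to be collected to the driver. 'features' and
# 'genotypes' are assumed loaded in the earlier sketch.
inner = features.shuffleRegionJoin(genotypes)
# Right outer: genotypes with no overlapping feature are paired with None.
rightOuter = features.rightOuterShuffleRegionJoin(genotypes)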
""" return GenomicDataset(self._jvmRdd.rightOuterShuffleRegionJoin(genomicRdd._jvmRdd, flankSize), @@ -447,22 +446,22 @@ def rightOuterShuffleRegionJoin(self, genomicRdd, flankSize=0): def leftOuterShuffleRegionJoin(self, genomicRdd, flankSize=0): """ - Performs a sort-merge left outer join between this RDD and another RDD. + Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. Since this is a left outer join, all values in the - right RDD that do not overlap a value from the left RDD are dropped. - If a value from the left RDD does not overlap any values in the right - RDD, it will be paired with a `None` in the product of the join. + right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + If a value from the left genomic dataset does not overlap any values in the right + genomic dataset, it will be paired with a `None` in the product of the join. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, and all keys from the - left RDD that did not overlap a key in the left RDD. + left genomic dataset that did not overlap a key in the left genomic dataset. """ return GenomicDataset(self._jvmRdd.leftOuterShuffleRegionJoin(genomicRdd._jvmRdd, flankSize), @@ -471,23 +470,23 @@ def leftOuterShuffleRegionJoin(self, genomicRdd, flankSize=0): def leftOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0): """ - Performs a sort-merge left outer join between this RDD and another RDD, + Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, followed by a groupBy on the left value. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. Since this is a left outer join, all values in the - right RDD that do not overlap a value from the left RDD are dropped. - If a value from the left RDD does not overlap any values in the right - RDD, it will be paired with an empty Iterable in the product of the join. + right genomic dataset that do not overlap a value from the left genomic dataset are dropped. + If a value from the left genomic dataset does not overlap any values in the right + genomic dataset, it will be paired with an empty Iterable in the product of the join. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. 
- :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, and all keys from the - left RDD that did not overlap a key in the left RDD. + left genomic dataset that did not overlap a key in the left genomic dataset. """ return GenomicDataset(self._jvmRdd.leftOuterShuffleRegionJoinAndGroupByLeft(genomicRdd._jvmRdd, flankSize), @@ -496,19 +495,19 @@ def leftOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0): def fullOuterShuffleRegionJoin(self, genomicRdd, flankSize=0): """ - Performs a sort-merge full outer join between this RDD and another RDD. + Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. Since this is a full outer join, if a value from either - RDD does not overlap any values in the other RDD, it will be paired with + genomic dataset does not overlap any values in the other genomic dataset, it will be paired with a `None` in the product of the join. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, and values that did not overlap will be paired with a `None`. """ @@ -519,24 +518,24 @@ def fullOuterShuffleRegionJoin(self, genomicRdd, flankSize=0): def rightOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0): """ - Performs a sort-merge right outer join between this RDD and another RDD, + Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, followed by a groupBy on the left value, if not null. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. In the same operation, we group all values by the left - item in the RDD. Since this is a right outer join, all values from the - right RDD who did not overlap a value from the left RDD are placed into + item in the genomic dataset. Since this is a right outer join, all values from the + right genomic dataset who did not overlap a value from the left genomic dataset are placed into a length-1 Iterable with a `None` key. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. 
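# Illustrative sketch (not part of the patch) of the outer and grouping shuffle
# join flavors described above, using the same assumed inputs.
# fullOuterShuffleRegionJoin keeps unmatched values from both sides, paired
# with None; leftOuterShuffleRegionJoinAndGroupByLeft collects all overlapping
# right-hand values under each left value, yielding an empty Iterable when
# nothing overlaps.
fullOuter = features.fullOuterShuffleRegionJoin(genotypes)
groupedByLeft = features.leftOuterShuffleRegionJoinAndGroupByLeft(genotypes)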
- :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, grouped together by - the value they overlapped in the left RDD, and all values from the - right RDD that did not overlap an item in the left RDD. + the value they overlapped in the left genomic dataset, and all values from the + right genomic dataset that did not overlap an item in the left genomic dataset. """ return GenomicDataset(self._jvmRdd.rightOuterShuffleRegionJoinAndGroupByLeft(genomicRdd._jvmRdd, flankSize), @@ -545,21 +544,21 @@ def rightOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0): def shuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0): """ - Performs a sort-merge inner join between this RDD and another RDD, + Performs a sort-merge inner join between this genomic dataset and another genomic dataset, followed by a groupBy on the left value. - In a sort-merge join, both RDDs are co-partitioned and sorted. The + In a sort-merge join, both genomic datasets are co-partitioned and sorted. The partitions are then zipped, and we do a merge join on each partition. The key equality function used for this join is the reference region overlap function. In the same operation, we group all values by the left - item in the RDD. + item in the genomic dataset. - :param GenomicDataset genomicRdd: The right RDD in the join. + :param GenomicDataset genomicRdd: The right genomic dataset in the join. :param int flankSize: Sets a flankSize for the distance between elements to be joined. If set to 0, an overlap is required to join two elements. - :return: Returns a new genomic RDD containing all pairs of keys that + :return: Returns a new genomic dataset containing all pairs of keys that overlapped in the genomic coordinate space, grouped together by - the value they overlapped in the left RDD. + the value they overlapped in the left genomic dataset. """ return GenomicDataset(self._jvmRdd.shuffleRegionJoinAndGroupByLeft(genomicRdd._jvmRdd, flankSize), @@ -568,8 +567,8 @@ def shuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0): def toDF(self): """ - Converts this GenomicDatset into a dataframe. - :return: Returns a dataframe representing this RDD. + Converts this GenomicDataset into a DataFrame. + :return: Returns a dataframe representing this genomic dataset. """ return DataFrame(self._jvmRdd.toDF(), SQLContext(self.sc)) @@ -577,7 +576,7 @@ def toDF(self): class VCFSupportingGenomicDataset(GenomicDataset): """ - Wraps an GenomicDatset with VCF metadata. + Wraps an GenomicDataset with VCF metadata. """ def __init__(self, jvmRdd, sc): @@ -635,7 +634,7 @@ def addFixedArrayFormatHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addFixedArrayFormatHeaderLine(name, @@ -656,7 +655,7 @@ def addScalarFormatHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. 
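Every join above hands back a GenomicDataset, and toDF() exposes it to Spark SQL; a short sketch reusing the `reads` dataset from the first example. Column names follow the AlignmentRecord Avro schema for this ADAM release.

```python
# Inspect any genomic dataset through its Spark SQL DataFrame view.
df = reads.toDF()
df.printSchema()
print(df.count())
```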
""" return self._replaceRdd(self._jvmRdd.addScalarFormatHeaderLine(name, @@ -679,7 +678,7 @@ def addGenotypeArrayFormatHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addGenotypeArrayFormatHeaderLine(name, @@ -702,7 +701,7 @@ def addAlternateAlleleArrayFormatHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addAlternateAlleleArrayFormatHeaderLine(name, @@ -726,7 +725,7 @@ def addAllAlleleArrayFormatHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addAllAlleleArrayFormatHeaderLine(name, @@ -748,7 +747,7 @@ def addFixedArrayInfoHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addFixedArrayInfoHeaderLine(name, @@ -769,7 +768,7 @@ def addScalarInfoHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addScalarInfoHeaderLine(name, @@ -792,7 +791,7 @@ def addAlternateAlleleArrayInfoHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addAlternateAlleleArrayInfoHeaderLine(name, @@ -816,7 +815,7 @@ def addAllAlleleArrayInfoHeaderLine(self, field. :param lineType: A Python primitive type corresponding to the type of data stored in the array. Supported types include str, int, float, and chr. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addAllAlleleArrayInfoHeaderLine(name, @@ -832,24 +831,24 @@ def addFilterHeaderLine(self, :param str id: The identifier for the filter. :param str description: A description of the filter. - :return: A new RDD with the new header line added. + :return: A new genomic dataset with the new header line added. """ return self._replaceRdd(self._jvmRdd.addFilterHeaderLine(name, description)) -class AlignmentRecordRDD(GenomicDataset): +class AlignmentRecordDataset(GenomicDataset): """ - Wraps an GenomicDatset with Alignment Record metadata and functions. + Wraps an GenomicDataset with Alignment Record metadata and functions. 
""" def __init__(self, jvmRdd, sc): """ - Constructs a Python AlignmentRecordRDD from a JVM AlignmentRecordRDD. + Constructs a Python AlignmentRecordDataset from a JVM AlignmentRecordDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM AlignmentRecordRDD. + :param jvmRdd: Py4j handle to the underlying JVM AlignmentRecordDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -858,7 +857,7 @@ def __init__(self, jvmRdd, sc): def _replaceRdd(self, newRdd): - return AlignmentRecordRDD(newRdd, self.sc) + return AlignmentRecordDataset(newRdd, self.sc) def _inferConversionFn(self, destClass): @@ -870,34 +869,34 @@ def toFragments(self): """ Convert this set of reads into fragments. - :return: Returns a FragmentRDD where all reads have been grouped + :return: Returns a FragmentDataset where all reads have been grouped together by the original sequence fragment they come from. - :rtype: bdgenomics.adam.rdd.FragmentRDD + :rtype: bdgenomics.adam.rdd.FragmentDataset """ - return FragmentRDD(self._jvmRdd.toFragments(), self.sc) + return FragmentDataset(self._jvmRdd.toFragments(), self.sc) def toCoverage(self, collapse = True): """ - Converts this set of reads into a corresponding CoverageRDD. + Converts this set of reads into a corresponding CoverageDataset. :param bool collapse: Determines whether to merge adjacent coverage elements with the same score to a single coverage observation. - :return: Returns an RDD with observed coverage. - :rtype: bdgenomics.adam.rdd.CoverageRDD + :return: Returns a genomic dataset with observed coverage. + :rtype: bdgenomics.adam.rdd.CoverageDataset """ - coverageRDD = CoverageRDD(self._jvmRdd.toCoverage(), self.sc) + coverage = CoverageDataset(self._jvmRdd.toCoverage(), self.sc) if (collapse): - return coverageRDD.collapse() + return coverage.collapse() else: - return coverageRDD + return coverage def save(self, filePath, isSorted = False): """ - Saves this RDD to disk, with the type identified by the extension. + Saves this genomic dataset to disk, with the type identified by the extension. :param str filePath: The path to save the file to. :param bool isSorted: Whether the file is sorted or not. @@ -912,7 +911,7 @@ def saveAsSam(self, isSorted=False, asSingleFile=False): """ - Saves this RDD to disk as a SAM/BAM/CRAM file. + Saves this genomic dataset to disk as a SAM/BAM/CRAM file. :param str filePath: The path to save the file to. :param str asType: The type of file to save. Valid choices are SAM, BAM, @@ -935,13 +934,13 @@ def saveAsSam(self, def saveAsSamString(self): """ - Converts an RDD into the SAM spec string it represents. + Converts a genomic dataset into the SAM spec string it represents. - This method converts an RDD of AlignmentRecords back to an RDD of + This method converts an genomic dataset of AlignmentRecords back to an RDD of SAMRecordWritables and a SAMFileHeader, and then maps this RDD into a string on the driver that represents this file in SAM. - :return: A string on the driver representing this RDD of reads in SAM format. + :return: A string on the driver representing this genomic dataset of reads in SAM format. :rtype: str """ @@ -970,10 +969,9 @@ def sortReadsByReferencePosition(self): lexicographically by name. :return: Returns a new RDD containing sorted reads. 
- :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ - - return AlignmentRecordRDD(self._jvmRdd.sortReadsByReferencePosition(), + return AlignmentRecordDataset(self._jvmRdd.sortReadsByReferencePosition(), self.sc) @@ -985,11 +983,11 @@ def sortReadsByReferencePositionAndIndex(self): put at the end and sorted by read name. Contigs are ordered by index that they are ordered in the sequence metadata. - :return: Returns a new RDD containing sorted reads. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :return: Returns a new genomic dataset containing sorted reads. + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ - return AlignmentRecordRDD(self._jvmRdd.sortReadsByReferencePositionAndIndex(), + return AlignmentRecordDataset(self._jvmRdd.sortReadsByReferencePositionAndIndex(), self.sc) @@ -997,12 +995,12 @@ def markDuplicates(self): """ Marks reads as possible fragment duplicates. - :return: A new RDD where reads have the duplicate read flag set. + :return: A new genomic dataset where reads have the duplicate read flag set. Duplicate reads are NOT filtered out. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ - return AlignmentRecordRDD(self._jvmRdd.markDuplicates(), + return AlignmentRecordDataset(self._jvmRdd.markDuplicates(), self.sc) @@ -1013,11 +1011,10 @@ def recalibrateBaseQualities(self, Runs base quality score recalibration on a set of reads. Uses a table of known SNPs to mask true variation during the recalibration process. - :param bdgenomics.adam.rdd.VariantRDD knownSnps: A table of known SNPs to mask valid variants. + :param bdgenomics.adam.rdd.VariantDataset knownSnps: A table of known SNPs to mask valid variants. :param bdgenomics.adam.stringency validationStringency: """ - - return AlignmentRecordRDD(self._jvmRdd.recalibrateBaseQualities(knownSnps._jvmRdd, + return AlignmentRecordDataset(self._jvmRdd.recalibrateBaseQualities(knownSnps._jvmRdd, _toJava(validationStringency, self.sc._jvm))) @@ -1042,12 +1039,12 @@ def realignIndels(self, realignments are only finalized if the log-odds threshold is exceeded. :param int maxTargetSize: The maximum width of a single target region for realignment. - :return: Returns an RDD of mapped reads which have been realigned. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :return: Returns an genomic dataset of mapped reads which have been realigned. + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ consensusModel = self.sc._jvm.org.bdgenomics.adam.algorithms.consensus.ConsensusGenerator.fromReads() - return AlignmentRecordRDD(self._jvmRdd.realignIndels(consensusModel, + return AlignmentRecordDataset(self._jvmRdd.realignIndels(consensusModel, isSorted, maxIndelSize, maxConsensusNumber, @@ -1068,7 +1065,7 @@ def realignIndels(self, Generates consensuses from prior called INDELs. - :param bdgenomics.adam.rdd.VariantRDD knownIndels: An RDD of previously + :param bdgenomics.adam.rdd.VariantDataset knownIndels: An RDD of previously called INDEL variants. :param bool isSorted: If the input data is sorted, setting this parameter to true avoids a second sort. @@ -1080,12 +1077,12 @@ def realignIndels(self, realignments are only finalized if the log-odds threshold is exceeded. :param int maxTargetSize: The maximum width of a single target region for realignment. - :return: Returns an RDD of mapped reads which have been realigned. 
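The preprocessing chain (duplicate marking, BQSR, indel realignment, sorting) composes as before; a sketch assuming a hypothetical known-SNPs VCF and that the stringency and realignment parameters keep their documented defaults when omitted.

```python
known_snps = ac.loadVariants("known_snps.vcf")  # hypothetical path

cleaned = (reads
           .markDuplicates()                      # flag duplicates, keep them
           .recalibrateBaseQualities(known_snps)  # BQSR, masking known SNPs
           .realignIndels()                       # consensus generated from reads
           .sortReadsByReferencePosition())
```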
- :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :return: Returns a genomic dataset of mapped reads which have been realigned. + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ consensusModel = self.sc._jvm.org.bdgenomics.adam.algorithms.consensus.ConsensusGenerator.fromKnowns(knownIndels._jvmRdd) - return AlignmentRecordRDD(self._jvmRdd.realignIndels(consensusModel, + return AlignmentRecordDataset(self._jvmRdd.realignIndels(consensusModel, isSorted, maxIndelSize, maxConsensusNumber, @@ -1124,7 +1121,7 @@ def saveAsPairedFastq(self, false, writes out reads with the base qualities from the qual field. Default is false. :param bdgenomics.adam.stringency validationStringency: If strict, throw - an exception if any read in this RDD is not accompanied by its mate. + an exception if any read in this genomic dataset is not accompanied by its mate. :param pyspark.storagelevel.StorageLevel persistLevel: The persistance level to cache reads at between passes. """ @@ -1145,7 +1142,7 @@ def saveAsFastq(self, :param str fileName: Path to save files at. :param bdgenomics.adam.stringency validationStringency: If strict, throw - an exception if any read in this RDD is not accompanied by its mate. + an exception if any read in this genomic dataset is not accompanied by its mate. :param bool sort: Whether to sort the FASTQ files by read name or not. Defaults to false. Sorting the output will recover pair order, if desired. @@ -1175,32 +1172,32 @@ def reassembleReadPairs(self, from the pairs. :param bdgenomics.adam.stringency validationStringency: How stringently to validate the reads. - :return: Returns an RDD with the pair information recomputed. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + :return: Returns a genomic dataset with the pair information recomputed. + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ - - return AlignmentRecordRDD(self._jvmRdd.reassembleReadPairs(rdd._jrdd, - _toJava(validationStringency, self.sc._jvm)), - self.sc) + return AlignmentRecordDataset(self._jvmRdd.reassembleReadPairs(rdd._jrdd, + _toJava(validationStringency, self.sc._jvm)), + self.sc) -class CoverageRDD(GenomicDataset): +class CoverageDataset(GenomicDataset): """ - Wraps an GenomicDatset with Coverage metadata and functions. + Wraps an GenomicDataset with Coverage metadata and functions. """ + def _replaceRdd(self, newRdd): - return CoverageRDD(newRdd, self.sc) + return CoverageDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python CoverageRDD from a JVM CoverageRDD. + Constructs a Python CoverageDataset from a JVM CoverageDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM CoverageRDD. + :param jvmRdd: Py4j handle to the underlying JVM CoverageDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1224,26 +1221,26 @@ def collapse(self): Merges adjacent ReferenceRegions with the same coverage value. This reduces the loss of coverage information while reducing the number - of records in the RDD. For example, adjacent records Coverage("chr1", 1, + of records in the genomic dataset. For example, adjacent records Coverage("chr1", 1, 10, 3.0) and Coverage("chr1", 10, 20, 3.0) would be merged into one record Coverage("chr1", 1, 20, 3.0). - :return: An RDD with merged tuples of adjacent sites with same coverage. - :rtype: bdgenomics.adam.rdd.CoverageRDD + :return: A genomic dataset with merged tuples of adjacent sites with same coverage. 
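The collapse behaviour spelled out in this docstring is easy to observe from the DataFrame view; a sketch reusing `reads` from above.

```python
raw = reads.toCoverage(collapse=False)   # one record per observed interval
merged = raw.collapse()                  # adjacent equal-depth records merged

# e.g. Coverage("chr1", 1, 10, 3.0) and Coverage("chr1", 10, 20, 3.0)
# collapse into a single Coverage("chr1", 1, 20, 3.0) record.
print(raw.toDF().count(), merged.toDF().count())
```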
+ :rtype: bdgenomics.adam.rdd.CoverageDataset """ - return CoverageRDD(self._jvmRdd.collapse(), self.sc) + return CoverageDataset(self._jvmRdd.collapse(), self.sc) def toFeatures(self): """ - Converts CoverageRDD to FeatureRDD. + Converts CoverageDataset to FeatureDataset. - :return: Returns a FeatureRDD from CoverageRDD. - :rtype: bdgenomics.adam.rdd.FeatureRDD + :return: Returns a FeatureDataset from CoverageDataset. + :rtype: bdgenomics.adam.rdd.FeatureDataset """ - return FeatureRDD(self._jvmRdd.toFeatures(), self.sc) + return FeatureDataset(self._jvmRdd.toFeatures(), self.sc) def coverage(self, bpPerBin = 1): @@ -1255,11 +1252,11 @@ def coverage(self, bpPerBin = 1): of each bin is the coverage of the first base pair in that bin. :param int bpPerBin: Number of bases to combine to one bin. - :return: Returns a sparsified CoverageRDD. - :rtype: bdgenomics.adam.rdd.CoverageRDD + :return: Returns a sparsified CoverageDataset. + :rtype: bdgenomics.adam.rdd.CoverageDataset """ - return CoverageRDD(self._jvmRdd.coverage(bpPerBin), self.sc) + return CoverageDataset(self._jvmRdd.coverage(bpPerBin), self.sc) def aggregatedCoverage(self, bpPerBin = 1): @@ -1271,24 +1268,24 @@ def aggregatedCoverage(self, bpPerBin = 1): of each bin is the average coverage of the bases in that bin. :param int bpPerBin: Number of bases to combine to one bin. - :return: Returns a sparsified CoverageRDD. - :rtype: bdgenomics.adam.rdd.CoverageRDD + :return: Returns a sparsified CoverageDataset. + :rtype: bdgenomics.adam.rdd.CoverageDataset """ - return CoverageRDD(self._jvmRdd.aggregatedCoverage(bpPerBin), self.sc) + return CoverageDataset(self._jvmRdd.aggregatedCoverage(bpPerBin), self.sc) def flatten(self): """ - Gets flattened RDD of coverage, with coverage mapped to each base pair. + Gets flattened genomic dataset of coverage, with coverage mapped to each base pair. The opposite operation of collapse. - :return: New CoverageRDD of flattened coverage. - :rtype: bdgenomics.adam.rdd.CoverageRDD + :return: New CoverageDataset of flattened coverage. + :rtype: bdgenomics.adam.rdd.CoverageDataset """ - return CoverageRDD(self._jvmRdd.flatten(), self.sc) + return CoverageDataset(self._jvmRdd.flatten(), self.sc) def _inferConversionFn(self, destClass): @@ -1296,23 +1293,23 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.CoverageTo%s" % self._destClassSuffix(destClass) -class FeatureRDD(GenomicDataset): +class FeatureDataset(GenomicDataset): """ - Wraps an GenomicDatset with Feature metadata and functions. + Wraps an GenomicDataset with Feature metadata and functions. """ def _replaceRdd(self, newRdd): - return FeatureRDD(newRdd, self.sc) + return FeatureDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python FeatureRDD from a JVM FeatureRDD. + Constructs a Python FeatureDataset from a JVM FeatureDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM FeatureRDD. + :param jvmRdd: Py4j handle to the underlying JVM FeatureDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1341,13 +1338,13 @@ def save(self, filePath, asSingleFile = False, disableFastConcat = False): def toCoverage(self): """ - Converts the FeatureRDD to a CoverageRDD. + Converts the FeatureDataset to a CoverageDataset. - :return: Returns a new CoverageRDD. - :rtype: bdgenomics.adam.rdd.CoverageRDD. + :return: Returns a new CoverageDataset. 
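Binning and the feature round trip are one-liners on CoverageDataset; a sketch continuing from `merged` above, with an arbitrary 10 bp bin width.

```python
# Sparsify: keep the first base's depth per 10 bp bin, or the bin average.
binned = merged.coverage(bpPerBin=10)
averaged = merged.aggregatedCoverage(bpPerBin=10)

# Coverage and features convert both ways; depth travels in Feature.score.
as_features = merged.toFeatures()
round_tripped = as_features.toCoverage()
```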
+ :rtype: bdgenomics.adam.rdd.CoverageDataset. """ - return CoverageRDD(self._jvmRdd.toCoverage(), self.sc) + return CoverageDataset(self._jvmRdd.toCoverage(), self.sc) def _inferConversionFn(self, destClass): @@ -1355,23 +1352,23 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.FeaturesTo%s" % self._destClassSuffix(destClass) -class FragmentRDD(GenomicDataset): +class FragmentDataset(GenomicDataset): """ - Wraps an GenomicDatset with Fragment metadata and functions. + Wraps an GenomicDataset with Fragment metadata and functions. """ def _replaceRdd(self, newRdd): - return FragmentRDD(newRdd, self.sc) + return FragmentDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python FragmentRDD from a JVM FragmentRDD. + Constructs a Python FragmentDataset from a JVM FragmentDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM FragmentRDD. + :param jvmRdd: Py4j handle to the underlying JVM FragmentDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1380,25 +1377,25 @@ def __init__(self, jvmRdd, sc): def toReads(self): """ - Splits up the reads in a Fragment, and creates a new RDD. - - :return: Returns this RDD converted back to reads. - :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD + Splits up the reads in a Fragment, and creates a new genomic dataset. + + :return: Returns this genomic dataset converted back to reads. + :rtype: bdgenomics.adam.rdd.AlignmentRecordDataset """ - return AlignmentRecordRDD(self._jvmRdd.toReads(), self.sc) + return AlignmentRecordDataset(self._jvmRdd.toReads(), self.sc) def markDuplicates(self): """ Marks reads as possible fragment duplicates. - :return: A new RDD where reads have the duplicate read flag set. + :return: A new genomic dataset where reads have the duplicate read flag set. Duplicate reads are NOT filtered out. - :rtype: bdgenomics.adam.rdd.FragmentRDD + :rtype: bdgenomics.adam.rdd.FragmentDataset """ - return FragmentRDD(self._jvmRdd.markDuplicates(), self.sc) + return FragmentDataset(self._jvmRdd.markDuplicates(), self.sc) def save(self, filePath): @@ -1416,23 +1413,23 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.FragmentsTo%s" % self._destClassSuffix(destClass) -class GenotypeRDD(VCFSupportingGenomicDataset): +class GenotypeDataset(VCFSupportingGenomicDataset): """ - Wraps an GenomicDatset with Genotype metadata and functions. + Wraps an GenomicDataset with Genotype metadata and functions. """ def _replaceRdd(self, newRdd): - return GenotypeRDD(newRdd, self.sc) + return GenotypeDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python GenotypeRDD from a JVM GenotypeRDD. + Constructs a Python GenotypeDataset from a JVM GenotypeDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM GenotypeRDD. + :param jvmRdd: Py4j handle to the underlying JVM GenotypeDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1441,7 +1438,7 @@ def __init__(self, jvmRdd, sc): def saveAsParquet(self, filePath): """ - Saves this RDD of genotypes to disk as Parquet. + Saves this genomic dataset of genotypes to disk as Parquet. :param str filePath: Path to save file to. 
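Fragment- and genotype-level datasets keep their old behaviour under the new names; a sketch with hypothetical output paths, reusing `fragments` and `genotypes` from the earlier examples.

```python
# Fragments split back into reads; duplicate marking works at either level.
reads_again = fragments.toReads()
deduped = fragments.markDuplicates()
deduped.save("out.fragments.adam")

# Genotypes persist as Parquet + Avro.
genotypes.saveAsParquet("out.genotypes.adam")
```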
""" @@ -1455,22 +1452,21 @@ def toVariantContexts(self): """ vcs = self._jvmRdd.toVariantContexts() - return VariantContextRDD(vcs, self.sc) + return VariantContextDataset(vcs, self.sc) def toVariants(self, dedupe=False): """ - Extracts the variants contained in this RDD of genotypes. + Extracts the variants contained in this genomic dataset of genotypes. Does not perform any filtering looking at whether the variant was called or not. By default, does not deduplicate variants. :param bool dedupe: If true, drops variants described in more than one genotype record. - :return: Returns the variants described by this GenotypeRDD. + :return: Returns the variants described by this GenotypeDataset. """ - - return VariantRDD(self._jvmRdd.toVariants(dedupe), self.sc) + return VariantDataset(self._jvmRdd.toVariants(dedupe), self.sc) def _inferConversionFn(self, destClass): @@ -1478,23 +1474,23 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.GenotypesTo%s" % self._destClassSuffix(destClass) -class NucleotideContigFragmentRDD(GenomicDataset): +class NucleotideContigFragmentDataset(GenomicDataset): """ - Wraps an GenomicDatset with Nucleotide Contig Fragment metadata and functions. + Wraps an GenomicDataset with Nucleotide Contig Fragment metadata and functions. """ def _replaceRdd(self, newRdd): - return NucleotideContigFragmentRDD(newRdd, self.sc) + return NucleotideContigFragmentDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python NucleotideContigFragmentRDD from a JVM - NucleotideContigFragmentRDD. Should not be called from user code; + Constructs a Python NucleotideContigFragmentDataset from a JVM + NucleotideContigFragmentDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM NucleotideContigFragmentRDD. + :param jvmRdd: Py4j handle to the underlying JVM NucleotideContigFragmentDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1517,18 +1513,17 @@ def save(self, fileName): def flankAdjacentFragments(self, flankLength): """ - For all adjacent records in the RDD, we extend the records so that the + For all adjacent records in the genomic dataset, we extend the records so that the adjacent records now overlap by _n_ bases, where _n_ is the flank length. :param int flankLength: The length to extend adjacent records by. - :return: Returns the RDD, with all adjacent fragments extended with + :return: Returns the genomic dataset, with all adjacent fragments extended with flanking sequence. - :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentRDD + :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentDataset """ - return NucleotideContigFragmentRDD(self._jvmRdd.flankAdjacentFragments(flankLength), - self.sc) + return NucleotideContigFragmentDataset(self._jvmRdd.flankAdjacentFragments(flankLength), self.sc) def countKmers(self, kmerLength): @@ -1548,23 +1543,23 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.ContigsTo%s" % self._destClassSuffix(destClass) -class VariantRDD(VCFSupportingGenomicDataset): +class VariantDataset(VCFSupportingGenomicDataset): """ - Wraps an GenomicDatset with Variant metadata and functions. + Wraps an GenomicDataset with Variant metadata and functions. """ def _replaceRdd(self, newRdd): - return VariantRDD(newRdd, self.sc) + return VariantDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python VariantRDD from a JVM VariantRDD. 
+ Constructs a Python VariantDataset from a JVM VariantDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM VariantRDD. + :param jvmRdd: Py4j handle to the underlying JVM VariantDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1577,12 +1572,12 @@ def toVariantContexts(self): """ vcs = self._jvmRdd.toVariantContexts() - return VariantContextRDD(vcs, self.sc) + return VariantContextDataset(vcs, self.sc) def saveAsParquet(self, filePath): """ - Saves this RDD of variants to disk as Parquet. + Saves this genomic dataset of variants to disk as Parquet. :param str filePath: Path to save file to. """ @@ -1594,24 +1589,24 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.VariantsTo%s" % self._destClassSuffix(destClass) - -class VariantContextRDD(VCFSupportingGenomicDataset): + +class VariantContextDataset(VCFSupportingGenomicDataset): """ Wraps an GenomicDataset with Variant Context metadata and functions. """ def _replaceRdd(self, newRdd): - return VariantContextRDD(newRdd, self.sc) + return VariantContextDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python VariantContextRDD from a JVM VariantContextRDD. + Constructs a Python VariantContextDataset from a JVM VariantContextDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM VariantContextRDD. + :param jvmRdd: Py4j handle to the underlying JVM VariantContextDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1625,7 +1620,7 @@ def saveAsVcf(self, stringency=LENIENT, disableFastConcat=False): """ - Saves this RDD of variants to disk as VCF. + Saves this genomic dataset of variants to disk as VCF. :param str filePath: Path to save file to. 
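Writing results back out as VCF closes the loop; a sketch with hypothetical output paths, assuming saveAsVcf accepts the filePath and asSingleFile arguments documented just below and leaves its remaining options at their defaults.

```python
# Variants persist as Parquet, or go out as VCF via variant contexts.
variants.saveAsParquet("out.variants.adam")
contexts.saveAsVcf("out.vcf", asSingleFile=True)
```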
:param bool asSingleFile: If true, saves the output as a single file diff --git a/adam-python/bdgenomics/adam/test/alignmentRecordRdd_test.py b/adam-python/bdgenomics/adam/test/alignmentRecordDataset_test.py similarity index 97% rename from adam-python/bdgenomics/adam/test/alignmentRecordRdd_test.py rename to adam-python/bdgenomics/adam/test/alignmentRecordDataset_test.py index 3e3cffb4d7..9b78225435 100644 --- a/adam-python/bdgenomics/adam/test/alignmentRecordRdd_test.py +++ b/adam-python/bdgenomics/adam/test/alignmentRecordDataset_test.py @@ -19,13 +19,13 @@ from bdgenomics.adam.adamContext import ADAMContext from bdgenomics.adam.models import ReferenceRegion -from bdgenomics.adam.rdd import AlignmentRecordRDD, CoverageRDD +from bdgenomics.adam.rdd import AlignmentRecordDataset, CoverageDataset from bdgenomics.adam.test import SparkTestCase from pyspark.sql.types import DoubleType from pyspark.storagelevel import StorageLevel -class AlignmentRecordRDDTest(SparkTestCase): +class AlignmentRecordDatasetTest(SparkTestCase): def test_save_sorted_sam(self): @@ -137,10 +137,10 @@ def test_transmute_to_coverage(self): x.end, x.mapq.cast(DoubleType()).alias("count"), x.recordGroupSample.alias("optSampleId")), - CoverageRDD) + CoverageDataset) - assert(isinstance(readsAsCoverage, CoverageRDD)) - self.assertEqual(readsAsCoverage.toDF().count(), 5) + assert(isinstance(readsAsCoverage, CoverageDataset)) + self.assertEquals(readsAsCoverage.toDF().count(), 5) def test_to_coverage(self): diff --git a/adam-python/bdgenomics/adam/test/coverageRdd_test.py b/adam-python/bdgenomics/adam/test/coverageDataset_test.py similarity index 91% rename from adam-python/bdgenomics/adam/test/coverageRdd_test.py rename to adam-python/bdgenomics/adam/test/coverageDataset_test.py index 27ec474911..aa0be7148d 100644 --- a/adam-python/bdgenomics/adam/test/coverageRdd_test.py +++ b/adam-python/bdgenomics/adam/test/coverageDataset_test.py @@ -18,11 +18,11 @@ from bdgenomics.adam.adamContext import ADAMContext -from bdgenomics.adam.rdd import CoverageRDD, FeatureRDD +from bdgenomics.adam.rdd import CoverageDataset, FeatureDataset from bdgenomics.adam.test import SparkTestCase import os -class CoverageRDDTest(SparkTestCase): +class CoverageDatasetTest(SparkTestCase): def test_save(self): @@ -55,8 +55,8 @@ def test_toFeatures(self): coverage = reads.toCoverage() features = coverage.toFeatures() - assert(isinstance(features, FeatureRDD)) - self.assertEqual(features.toDF().count(), coverage.toDF().count()) + assert(isinstance(features, FeatureDataset)) + self.assertEquals(features.toDF().count(), coverage.toDF().count()) def test_aggregatedCoverage(self): testFile = self.resourceFile("small.sam") diff --git a/adam-python/bdgenomics/adam/test/featureRdd_test.py b/adam-python/bdgenomics/adam/test/featureDataset_test.py similarity index 98% rename from adam-python/bdgenomics/adam/test/featureRdd_test.py rename to adam-python/bdgenomics/adam/test/featureDataset_test.py index f660547d4b..75698a600c 100644 --- a/adam-python/bdgenomics/adam/test/featureRdd_test.py +++ b/adam-python/bdgenomics/adam/test/featureDataset_test.py @@ -21,7 +21,7 @@ from bdgenomics.adam.test import SparkTestCase -class FeatureRDDTest(SparkTestCase): +class FeatureDatasetTest(SparkTestCase): def test_round_trip_gtf(self): diff --git a/adam-python/bdgenomics/adam/test/genotypeRdd_test.py b/adam-python/bdgenomics/adam/test/genotypeDataset_test.py similarity index 99% rename from adam-python/bdgenomics/adam/test/genotypeRdd_test.py rename to 
adam-python/bdgenomics/adam/test/genotypeDataset_test.py index 789d198ce2..88a6b2bf66 100644 --- a/adam-python/bdgenomics/adam/test/genotypeRdd_test.py +++ b/adam-python/bdgenomics/adam/test/genotypeDataset_test.py @@ -21,7 +21,7 @@ from bdgenomics.adam.test import SparkTestCase -class GenotypeRDDTest(SparkTestCase): +class GenotypeDatasetTest(SparkTestCase): def check_for_line_in_file(self, path, line): diff --git a/adam-python/bdgenomics/adam/test/variantRdd_test.py b/adam-python/bdgenomics/adam/test/variantDataset_test.py similarity index 97% rename from adam-python/bdgenomics/adam/test/variantRdd_test.py rename to adam-python/bdgenomics/adam/test/variantDataset_test.py index 467aa3eebc..1e11d6db4d 100644 --- a/adam-python/bdgenomics/adam/test/variantRdd_test.py +++ b/adam-python/bdgenomics/adam/test/variantDataset_test.py @@ -21,7 +21,7 @@ from bdgenomics.adam.test import SparkTestCase -class VariantRDDTest(SparkTestCase): +class VariantDatasetTest(SparkTestCase): def test_vcf_round_trip(self): diff --git a/adam-r/bdgenomics.adam/R/adam-context.R b/adam-r/bdgenomics.adam/R/adam-context.R index 4cf59ef93f..b9640be486 100644 --- a/adam-r/bdgenomics.adam/R/adam-context.R +++ b/adam-r/bdgenomics.adam/R/adam-context.R @@ -66,7 +66,7 @@ javaStringency <- function(stringency) { } -#' Load alignment records into an AlignmentRecordRDD. +#' Load alignment records into an AlignmentRecordDataset. #' #' Loads path names ending in: #' * .bam/.cram/.sam as BAM/CRAM/SAM format, @@ -83,7 +83,7 @@ javaStringency <- function(stringency) { #' @param ac The ADAMContext. #' @param filePath The path to load the file from. #' @param stringency The validation stringency to apply. Defaults to STRICT. -#' @return Returns an RDD containing reads. +#' @return Returns a genomic dataset containing reads. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -95,10 +95,10 @@ setMethod("loadAlignments", jrdd <- sparkR.callJMethod(ac@jac, "loadAlignments", filePath, jStringency) - AlignmentRecordRDD(jrdd) + AlignmentRecordDataset(jrdd) }) -#' Load nucleotide contig fragments into a NucleotideContigFragmentRDD. +#' Load nucleotide contig fragments into a NucleotideContigFragmentDataset. #' #' If the path name has a .fa/.fasta extension, load as FASTA format. #' Else, fall back to Parquet + Avro. @@ -108,7 +108,7 @@ setMethod("loadAlignments", #' #' @param ac The ADAMContext. #' @param filePath The path to load the file from. -#' @return Returns an RDD containing sequence fragments. +#' @return Returns a genomic dataset containing nucleotide contig fragments. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -117,10 +117,10 @@ setMethod("loadContigFragments", signature(ac = "ADAMContext", filePath = "character"), function(ac, filePath) { jrdd <- sparkR.callJMethod(ac@jac, "loadContigFragments", filePath) - NucleotideContigFragmentRDD(jrdd) + NucleotideContigFragmentDataset(jrdd) }) -#' Load fragments into a FragmentRDD. +#' Load fragments into a FragmentDataset. #' #' Loads path names ending in: #' * .bam/.cram/.sam as BAM/CRAM/SAM format and @@ -134,7 +134,7 @@ setMethod("loadContigFragments", #' @param ac The ADAMContext. #' @param filePath The path to load the file from. #' @param stringency The validation stringency to apply. Defaults to STRICT. -#' @return Returns an RDD containing sequence fragments. +#' @return Returns a genomic dataset containing sequence fragments. 
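The loader family documented here for R has Python counterparts on ADAMContext, so the renamed wrappers surface identically from both languages; a sketch with hypothetical paths, assuming the Python loaders mirror the R signatures shown in this hunk.

```python
# Extension-driven loaders; each returns the matching *Dataset wrapper.
frags = ac.loadFragments("sample.unaligned.bam")
cov = ac.loadCoverage("coverage.bed")
```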
#' #' @importFrom SparkR sparkR.callJMethod #' @@ -147,10 +147,10 @@ setMethod("loadFragments", "loadFragments", filePath, jStringency) - FragmentRDD(jrdd) + FragmentDataset(jrdd) }) -#' Load features into a FeatureRDD. +#' Load features into a FeatureDataset. #' #' Loads path names ending in: #' * .bed as BED6/12 format, @@ -168,7 +168,7 @@ setMethod("loadFragments", #' @param ac The ADAMContext. #' @param filePath The path to load the file from. #' @param stringency The validation stringency to apply. Defaults to STRICT. -#' @return Returns an RDD containing features. +#' @return Returns a genomic dataset containing features. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -181,10 +181,10 @@ setMethod("loadFeatures", "loadFeatures", filePath, jStringency) - FeatureRDD(jrdd) + FeatureDataset(jrdd) }) -#' Load features into a FeatureRDD and convert to a CoverageRDD. +#' Load features into a FeatureDataset and convert to a CoverageDataset. #' Coverage is stored in the score field of Feature. #' #' Loads path names ending in: @@ -203,7 +203,7 @@ setMethod("loadFeatures", #' @param ac The ADAMContext. #' @param filePath The path to load the file from. #' @param stringency The validation stringency to apply. Defaults to STRICT. -#' @return Returns an RDD containing coverage. +#' @return Returns a genomic dataset containing coverage. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -216,10 +216,10 @@ setMethod("loadCoverage", "loadCoverage", filePath, jStringency) - CoverageRDD(jrdd) + CoverageDataset(jrdd) }) -#' Load genotypes into a GenotypeRDD. +#' Load genotypes into a GenotypeDataset. #' #' If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. #' Else, fall back to Parquet + Avro. @@ -227,7 +227,7 @@ setMethod("loadCoverage", #' @param ac The ADAMContext. #' @param filePath The path to load the file from. #' @param stringency The validation stringency to apply. Defaults to STRICT. -#' @return Returns an RDD containing genotypes. +#' @return Returns a genomic dataset containing genotypes. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -240,10 +240,10 @@ setMethod("loadGenotypes", "loadGenotypes", filePath, jStringency) - GenotypeRDD(jrdd) + GenotypeDataset(jrdd) }) -#' Load variants into a VariantRDD. +#' Load variants into a VariantDataset. #' #' If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. #' Else, fall back to Parquet + Avro. @@ -251,7 +251,7 @@ setMethod("loadGenotypes", #' @param ac The ADAMContext. #' @param filePath The path to load the file from. #' @param stringency The validation stringency to apply. Defaults to STRICT. -#' @return Returns an RDD containing variants. +#' @return Returns a genomic dataset containing variants. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -264,5 +264,5 @@ setMethod("loadVariants", "loadVariants", filePath, jStringency) - VariantRDD(jrdd) + VariantDataset(jrdd) }) diff --git a/adam-r/bdgenomics.adam/R/generics.R b/adam-r/bdgenomics.adam/R/generics.R index 070934e634..2b61383820 100644 --- a/adam-r/bdgenomics.adam/R/generics.R +++ b/adam-r/bdgenomics.adam/R/generics.R @@ -61,7 +61,7 @@ setGeneric("loadGenotypes", setGeneric("loadVariants", function(ac, filePath, ...) { standardGeneric("loadVariants") }) -#### RDD operations #### +#### Genomic dataset operations #### #' The GenomicDataset is the base class that all genomic datatypes extend from in ADAM. #' @@ -75,7 +75,7 @@ NULL #' @param convFn The name of the ADAM GenomicDataset conversion class to #' use. #' @param ... 
additional argument(s). -#' @return Returns a new RDD where the input from the original RDD has +#' @return Returns a new genomic dataset where the input from the original genomic dataset has #' been piped through a command that runs locally on each executor. #' @export setGeneric("pipe", @@ -104,7 +104,7 @@ setGeneric("destClassSuffix", function(destClass) { standardGeneric("destClassSuffix") }) #' @rdname GenomicDataset -#' @param tFn A function that transforms the underlying RDD as a DataFrame. +#' @param tFn A function that transforms the underlying DataFrame as a DataFrame. #' @param destClass The destination class of this transmutation. #' @export setGeneric("transmute", @@ -125,9 +125,9 @@ setGeneric("sort", setGeneric("sortLexicographically", function(ardd) { standardGeneric("sortLexicographically") }) -#' Saves this RDD to disk as Parquet. +#' Saves this genomic dataset to disk as Parquet. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save file to. #' #' @rdname GenomicDataset @@ -256,51 +256,51 @@ setGeneric("shuffleRegionJoinAndGroupByLeft", #### AlignmentRecord operations #### -#' The AlignmentRecordRDD is the class used to manipulate genomic read data. +#' The AlignmentRecordDataset is the class used to manipulate genomic read data. #' -#' @name AlignmentRecordRDD +#' @name AlignmentRecordDataset NULL -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @export setGeneric("toFragments", function(ardd) { standardGeneric("toFragments") }) -#' @rdname AlignmentRecordRDD -#' @param ardd The RDD to apply this to. +#' @rdname AlignmentRecordDataset +#' @param ardd The genomic dataset to apply this to. #' @param ... additional argument(s). #' @export setGeneric("toCoverage", function(ardd, ...) { standardGeneric("toCoverage") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @param kmerLength The value of _k_ to use for cutting _k_-mers. #' @export setGeneric("countKmers", function(ardd, kmerLength) { standardGeneric("countKmers") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @param filePath The path to save the file to. #' @export setGeneric("saveAsSam", function(ardd, filePath, ...) { standardGeneric("saveAsSam") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @export setGeneric("sortReadsByReferencePosition", function(ardd) { standardGeneric("sortReadsByReferencePosition") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @export setGeneric("sortReadsByReferencePositionAndIndex", function(ardd) { standardGeneric("sortReadsByReferencePositionAndIndex") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @export setGeneric("markDuplicates", function(ardd) { standardGeneric("markDuplicates") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @param knownSnps A table of known SNPs to mask valid variants. #' @param validationStringency The stringency to apply towards validating BQSR. #' @export @@ -309,86 +309,86 @@ setGeneric("recalibrateBaseQualities", standardGeneric("recalibrateBaseQualities") }) -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @export setGeneric("realignIndels", function(ardd, ...) { standardGeneric("realignIndels") }) #### Coverage operations #### -#' The CoverageRDD class is used to manipulate read coverage counts. +#' The CoverageDataset class is used to manipulate read coverage counts. 
#' -#' @name CoverageRDD +#' @name CoverageDataset NULL -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' @param ... additional argument(s). #' @export setGeneric("collapse", function(ardd, ...) { standardGeneric("collapse") }) -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' @export setGeneric("toFeatures", function(ardd) { standardGeneric("toFeatures") }) -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' @export setGeneric("coverage", function(ardd, ...) { standardGeneric("coverage") }) -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' @export -#' @aliases aggregatedCoverage,CoverageRDD-method +#' @aliases aggregatedCoverage,CoverageDataset-method setGeneric("aggregatedCoverage", function(ardd, ...) { standardGeneric("aggregatedCoverage") }) -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' @export setGeneric("flatten", function(ardd) { standardGeneric("flatten") }) #### Fragment operations #### -#' The FragmentRDD class is used to manipulate paired reads. +#' The FragmentDataset class is used to manipulate paired reads. #' -#' @name FragmentRDD +#' @name FragmentDataset NULL -#' @rdname FragmentRDD -#' @param ardd The RDD to apply this to. +#' @rdname FragmentDataset +#' @param ardd The genomic dataset to apply this to. #' @export setGeneric("toReads", function(ardd) { standardGeneric("toReads") }) #### Genotype and Variant operations #### -#' Converts this RDD to VariantContexts. +#' Converts this genomic dataset to VariantContexts. #' -#' @param ardd The RDD to apply this to. -#' @return Returns this RDD of Variants as VariantContexts. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns this genomic dataset of Variants as VariantContexts. #' @export setGeneric("toVariantContexts", function(ardd) { standardGeneric("toVariantContexts") }) -#' Converts this RDD to Variants. +#' Converts this genomic dataset to Variants. #' -#' @param ardd The RDD to apply this to. -#' @return Returns this RDD of Genotypes as Variants. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns this genomic dataset of Genotypes as Variants. #' @export setGeneric("toVariants", function(ardd, ...) { standardGeneric("toVariants") }) #### NucleotideContigFragment operations #### -#' The NucleotideContigFragmentRDD class is used to manipulate contigs. +#' The NucleotideContigFragmentDataset class is used to manipulate contigs. #' -#' @name NucleotideContigFragmentRDD +#' @name NucleotideContigFragmentDataset NULL -#' @rdname NucleotideContigFragmentRDD -#' @param ardd The RDD to apply this to. +#' @rdname NucleotideContigFragmentDataset +#' @param ardd The genomic dataset to apply this to. #' @param flankLength The length to extend adjacent records by. #' @export setGeneric("flankAdjacentFragments", @@ -398,18 +398,18 @@ setGeneric("flankAdjacentFragments", #### Variant operations #### -#' The VariantContextRDD class is used to manipulate VCF-styled data. +#' The VariantContextDataset class is used to manipulate VCF-styled data. #' -#' Each element in a VariantContext RDD corresponds to a VCF line. This -#' differs from the GenotypeRDD, where each element represents the genotype -#' of a single sample at a single site, or a VariantRDD, which represents +#' Each element in a VariantContext genomic dataset corresponds to a VCF line. This +#' differs from the GenotypeDataset, where each element represents the genotype +#' of a single sample at a single site, or a VariantDataset, which represents #' just the variant of interest. 
#' -#' @name VariantContextRDD +#' @name VariantContextDataset NULL -#' @rdname VariantContextRDD -#' @param ardd The RDD to apply this to. +#' @rdname VariantContextDataset +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save VCF to. #' @param ... additional argument(s). #' @export diff --git a/adam-r/bdgenomics.adam/R/rdd.R b/adam-r/bdgenomics.adam/R/rdd.R index 78df74e6a1..f5a550b50f 100644 --- a/adam-r/bdgenomics.adam/R/rdd.R +++ b/adam-r/bdgenomics.adam/R/rdd.R @@ -30,11 +30,11 @@ setClass("GenomicDataset", #' A class that wraps an RDD of genomic reads with helpful metadata. #' -#' @rdname AlignmentRecordRDD +#' @rdname AlignmentRecordDataset #' @slot jrdd The Java RDD of AlignmentRecords that this class wraps. #' #' @export -setClass("AlignmentRecordRDD", +setClass("AlignmentRecordDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") @@ -44,112 +44,112 @@ GenomicDataset <- function(jrdd) { } #' @importFrom methods new -AlignmentRecordRDD <- function(jrdd) { - new("AlignmentRecordRDD", jrdd = jrdd) +AlignmentRecordDataset <- function(jrdd) { + new("AlignmentRecordDataset", jrdd = jrdd) } #' A class that wraps an RDD of genomic coverage data with helpful metadata. #' -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' @slot jrdd The Java RDD of Coverage that this class wraps. #' #' @export -setClass("CoverageRDD", +setClass("CoverageDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -CoverageRDD <- function(jrdd) { - new("CoverageRDD", jrdd = jrdd) +CoverageDataset <- function(jrdd) { + new("CoverageDataset", jrdd = jrdd) } #' A class that wraps an RDD of genomic features with helpful metadata. #' -#' @rdname FeatureRDD +#' @rdname FeatureDataset #' @slot jrdd The Java RDD of Features that this class wraps. #' #' @export -setClass("FeatureRDD", +setClass("FeatureDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -FeatureRDD <- function(jrdd) { - new("FeatureRDD", jrdd = jrdd) +FeatureDataset <- function(jrdd) { + new("FeatureDataset", jrdd = jrdd) } #' A class that wraps an RDD of read pairs grouped by sequencing fragment with helpful metadata. #' -#' @rdname FragmentRDD +#' @rdname FragmentDataset #' @slot jrdd The Java RDD of Fragments that this class wraps. #' #' @export -setClass("FragmentRDD", +setClass("FragmentDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") -FragmentRDD <- function(jrdd) { - new("FragmentRDD", jrdd = jrdd) +FragmentDataset <- function(jrdd) { + new("FragmentDataset", jrdd = jrdd) } #' A class that wraps an RDD of genotypes with helpful metadata. #' -#' @rdname GenotypeRDD +#' @rdname GenotypeDataset #' @slot jrdd The Java RDD of Genotypes that this class wraps. #' #' @export -setClass("GenotypeRDD", +setClass("GenotypeDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -GenotypeRDD <- function(jrdd) { - new("GenotypeRDD", jrdd = jrdd) +GenotypeDataset <- function(jrdd) { + new("GenotypeDataset", jrdd = jrdd) } #' A class that wraps an RDD of contigs with helpful metadata. #' -#' @rdname NucleotideContigFragmentRDD +#' @rdname NucleotideContigFragmentDataset #' @slot jrdd The Java RDD of contigs that this class wraps. 
#' #' @export -setClass("NucleotideContigFragmentRDD", +setClass("NucleotideContigFragmentDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -NucleotideContigFragmentRDD <- function(jrdd) { - new("NucleotideContigFragmentRDD", jrdd = jrdd) +NucleotideContigFragmentDataset <- function(jrdd) { + new("NucleotideContigFragmentDataset", jrdd = jrdd) } #' A class that wraps an RDD of variants with helpful metadata. #' -#' @rdname VariantRDD +#' @rdname VariantDataset #' @slot jrdd The Java RDD of Variants that this class wraps. #' #' @export -setClass("VariantRDD", +setClass("VariantDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -VariantRDD <- function(jrdd) { - new("VariantRDD", jrdd = jrdd) +VariantDataset <- function(jrdd) { + new("VariantDataset", jrdd = jrdd) } #' A class that wraps an RDD of both variants and genotypes with helpful metadata. #' -#' @rdname VariantContextRDD +#' @rdname VariantContextDataset #' @slot jrdd The Java RDD of VariantContexts that this class wraps. #' #' @export -setClass("VariantContextRDD", +setClass("VariantContextDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -VariantContextRDD <- function(jrdd) { - new("VariantContextRDD", jrdd = jrdd) +VariantContextDataset <- function(jrdd) { + new("VariantContextDataset", jrdd = jrdd) } #' @@ -161,16 +161,16 @@ VariantContextRDD <- function(jrdd) { #' #' Pipes require the presence of an InFormatterCompanion and an OutFormatter #' as implicit values. The InFormatterCompanion should be a singleton whose -#' apply method builds an InFormatter given a specific type of GenomicRDD. +#' apply method builds an InFormatter given a specific type of GenomicDataset. #' The implicit InFormatterCompanion yields an InFormatter which is used to #' format the input to the pipe, and the implicit OutFormatter is used to #' parse the output from the pipe. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param cmd The command to run. #' @param tFormatter The name of the ADAM in-formatter class to use. #' @param xFormatter The name of the ADAM out-formatter class to use. -#' @param convFn The name of the ADAM GenomicRDD conversion class to +#' @param convFn The name of the ADAM GenomicDataset conversion class to #' use. #' @param files The files to copy locally onto all executors. Set to #' None (default) to omit. @@ -178,7 +178,7 @@ VariantContextRDD <- function(jrdd) { #' executor. Set to None (default) to omit. #' @param flankSize The number of bases of flanking sequence to have #' around each partition. Defaults to 0. -#' @return Returns a new RDD where the input from the original RDD has +#' @return Returns a new genomic dataset where the input from the original genomic dataset has #' been piped through a command that runs locally on each executor. #' #' @importFrom SparkR sparkR.callJStatic sparkR.callJMethod @@ -237,8 +237,8 @@ setMethod("pipe", #' Caches the existing ardd #' -#' @param ardd The RDD to apply this to. -#' @return A new RDD where the RDD of genomic data has been replaced, but the +#' @param ardd The genomic dataset to apply this to. +#' @return A new genomic dataset where the genomic dataset of genomic data has been replaced, but the #' metadata (sequence dictionary, and etc) is copied without modification. 
#' #' @importFrom SparkR sparkR.callJMethod @@ -252,9 +252,9 @@ setMethod("cache", #' Persists the existing ardd #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param sl the StorageLevel to persist in. -#' @return A new RDD where the RDD of genomic data has been replaced, but the +#' @return A new genomic dataset where the genomic dataset of genomic data has been replaced, but the #' metadata (sequence dictionary, and etc) is copied without modification. #' #' @importFrom SparkR sparkR.callJMethod sparkR.callJStatic @@ -270,8 +270,8 @@ setMethod("persist", #' Unpersists the existing ardd #' -#' @param ardd The RDD to apply this to. -#' @return A new RDD where the RDD of genomic data has been replaced, but the +#' @param ardd The genomic dataset to apply this to. +#' @return A new genomic dataset where the genomic dataset of genomic data has been replaced, but the #' metadata (sequence dictionary, and etc) is copied without modification. #' #' @importFrom SparkR sparkR.callJMethod @@ -286,8 +286,8 @@ setMethod("unpersist", #' Sorts our genome aligned data by reference positions, with contigs ordered #' by index. #' -#' @param ardd The RDD to apply this to. -#' @return Returns a new, sorted RDD, of the implementing class type. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns a new, sorted genomic dataset, of the implementing class type. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -301,8 +301,8 @@ setMethod("sort", #' Sorts our genome aligned data by reference positions, with contigs ordered #' lexicographically. #' -#' @param ardd The RDD to apply this to. -#' @return Returns a new, sorted RDD, of the implementing class type. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns a new, sorted genomic dataset, of the implementing class type. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -315,8 +315,8 @@ setMethod("sortLexicographically", #' Converts this GenomicDataset into a dataframe. #' -#' @param ardd The RDD to convert into a dataframe. -#' @return Returns a dataframe representing this RDD. +#' @param ardd The genomic dataset to convert into a dataframe. +#' @return Returns a dataframe representing this genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -345,9 +345,9 @@ setMethod("wrapTransformation", #' Applies a function that transforms the underlying DataFrame into a new DataFrame #' using the Spark SQL API. #' -#' @param ardd The RDD to apply this to. -#' @param tFn A function that transforms the underlying RDD as a DataFrame. -#' @return A new RDD where the RDD of genomic data has been replaced, but the +#' @param ardd The genomic dataset to apply this to. +#' @param tFn A function that transforms the underlying DataFrame as a DataFrame. +#' @return A new genomic dataset where the DataFrame of genomic data has been replaced, but the #' metadata (sequence dictionary, and etc) is copied without modification. 
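The caching, persistence, and sorting helpers documented here for R have GenomicDataset counterparts in Python as well; a sketch that assumes those wrappers (cache, persist, sort, sortLexicographically) behave as described for R and that persist accepts pyspark's StorageLevel.

```python
from pyspark.storagelevel import StorageLevel

cached = reads.cache()                      # data cached, metadata copied as-is
persisted = reads.persist(StorageLevel.MEMORY_AND_DISK)

by_index = reads.sort()                     # contigs ordered by dictionary index
lexical = reads.sortLexicographically()     # contigs ordered by name
```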
#' #' @importFrom SparkR sparkR.callJMethod @@ -373,19 +373,19 @@ setMethod("inferConversionFn", setMethod("destClassSuffix", signature(destClass = "character"), function(destClass) { - if (destClass == "NucleotideContigFragmentRDD") { + if (destClass == "NucleotideContigFragmentDataset") { "ContigsDatasetConverter" - } else if (destClass == "CoverageRDD") { + } else if (destClass == "CoverageDataset") { "CoverageDatasetConverter" - } else if (destClass == "FeatureRDD") { + } else if (destClass == "FeatureDataset") { "FeaturesDatasetConverter" - } else if (destClass == "FragmentRDD") { + } else if (destClass == "FragmentDataset") { "FragmentDatasetConverter" - } else if (destClass == "AlignmentRecordRDD") { + } else if (destClass == "AlignmentRecordDataset") { "AlignmentRecordDatasetConverter" - } else if (destClass == "GenotypeRDD") { + } else if (destClass == "GenotypeDataset") { "GenotypeDatasetConverter" - } else if (destClass == "VariantRDD") { + } else if (destClass == "VariantDataset") { "VariantDatasetConverter" } else { stop(paste("No conversion method known for", @@ -393,14 +393,14 @@ setMethod("destClassSuffix", } }) -#' Applies a function that transmutes the underlying DataFrame into a new RDD of a +#' Applies a function that transmutes the underlying DataFrame into a new DataFrame of a #' different type. #' -#' @param ardd The RDD to apply this to. -#' @param tFn A function that transforms the underlying RDD as a DataFrame. +#' @param ardd The genomic dataset to apply this to. +#' @param tFn A function that transforms the underlying DataFrame as a DataFrame. #' @param destClass The destination class of this transmutation. #' @param convFn The name of the ADAM GenomicDatasetConversion class to use. -#' @return A new RDD where the RDD of genomic data has been replaced, but the +#' @return A new genomic dataset where the genomic dataset of genomic data has been replaced, but the #' metadata (sequence dictionary, and etc) is copied without modification. #' #' @importFrom SparkR sparkR.callJMethod sparkR.callJStatic @@ -425,19 +425,19 @@ setMethod("transmute", jrdd = sparkR.callJMethod(ardd@jrdd, "transmuteDataFrame", dfFn, convFnInst)) }) -#' Performs a broadcast inner join between this RDD and another RDD. +#' Performs a broadcast inner join between this genomic dataset and another genomic dataset. #' -#' In a broadcast join, the left RDD (this RDD) is collected to the driver, +#' In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, #' and broadcast to all the nodes in the cluster. The key equality function #' used for this join is the reference region overlap function. Since this #' is an inner join, all values who do not overlap a value from the other -#' RDD are dropped. +#' genomic dataset are dropped. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space. #' #' @importFrom SparkR sparkR.callJMethod @@ -453,23 +453,23 @@ setMethod("broadcastRegionJoin", flankSize)) }) -#' Performs a broadcast right outer join between this RDD and another RDD. 
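The ``transmute`` generic documented above is exercised by the renamed test file later in this patch; a condensed version of that pattern, turning reads into a ``CoverageDataset``, is sketched here under the assumption that ``reads`` is an ``AlignmentRecordDataset``.

.. code:: r

    library(SparkR)
    library(bdgenomics.adam)

    # assumes `reads` is an AlignmentRecordDataset loaded elsewhere;
    # mirrors the pattern used in test_alignmentRecordDataset.R
    readsAsCoverage <- transmute(reads, function(df) {
        select(df, df$contigName, df$start, df$end,
               alias(cast(df$mapq, "double"), "count"),
               alias(cast(df$recordGroupSample, "string"), "optSampleId"))
    }, "CoverageDataset")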
+#' Performs a broadcast right outer join between this genomic dataset and another genomic dataset. #' -#' In a broadcast join, the left RDD (this RDD) is collected to the driver, +#' In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, #' and broadcast to all the nodes in the cluster. The key equality function #' used for this join is the reference region overlap function. Since this -#' is a right outer join, all values in the left RDD that do not overlap a -#' value from the right RDD are dropped. If a value from the right RDD does -#' not overlap any values in the left RDD, it will be paired with a `None` +#' is a right outer join, all values in the left genomic dataset that do not overlap a +#' value from the right genomic dataset are dropped. If a value from the right genomic dataset does +#' not overlap any values in the left genomic dataset, it will be paired with a `None` #' in the product of the join. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and all keys from the -#' right RDD that did not overlap a key in the left RDD. +#' right genomic dataset that did not overlap a key in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -484,19 +484,19 @@ setMethod("rightOuterBroadcastRegionJoin", flankSize)) }) -#' Performs a broadcast inner join between this RDD and another RDD. +#' Performs a broadcast inner join between this genomic dataset and another genomic dataset. #' -#' In a broadcast join, the left RDD (this RDD) is collected to the driver, +#' In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, #' and broadcast to all the nodes in the cluster. The key equality function #' used for this join is the reference region overlap function. Since this #' is an inner join, all values who do not overlap a value from the other -#' RDD are dropped. +#' genomic dataset are dropped. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space. #' #' @importFrom SparkR sparkR.callJMethod @@ -512,23 +512,23 @@ setMethod("broadcastRegionJoinAndGroupByRight", flankSize)) }) -#' Performs a broadcast right outer join between this RDD and another RDD. +#' Performs a broadcast right outer join between this genomic dataset and another genomic dataset. #' -#' In a broadcast join, the left RDD (this RDD) is collected to the driver, +#' In a broadcast join, the left genomic dataset (this genomic dataset) is collected to the driver, #' and broadcast to all the nodes in the cluster. 
The key equality function #' used for this join is the reference region overlap function. Since this -#' is a right outer join, all values in the left RDD that do not overlap a -#' value from the right RDD are dropped. If a value from the right RDD does -#' not overlap any values in the left RDD, it will be paired with a `None` +#' is a right outer join, all values in the left genomic dataset that do not overlap a +#' value from the right genomic dataset are dropped. If a value from the right genomic dataset does +#' not overlap any values in the left genomic dataset, it will be paired with a `None` #' in the product of the join. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and all keys from the -#' right RDD that did not overlap a key in the left RDD. +#' right genomic dataset that did not overlap a key in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -543,19 +543,19 @@ setMethod("rightOuterBroadcastRegionJoinAndGroupByRight", flankSize)) }) -#' Performs a sort-merge inner join between this RDD and another RDD. +#' Performs a sort-merge inner join between this genomic dataset and another genomic dataset. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. The #' partitions are then zipped, and we do a merge join on each partition. #' The key equality function used for this join is the reference region #' overlap function. Since this is an inner join, all values who do not -#' overlap a value from the other RDD are dropped. +#' overlap a value from the other genomic dataset are dropped. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space. #' #' @importFrom SparkR sparkR.callJMethod @@ -571,23 +571,23 @@ setMethod("shuffleRegionJoin", flankSize)) }) -#' Performs a sort-merge right outer join between this RDD and another RDD. +#' Performs a sort-merge right outer join between this genomic dataset and another genomic dataset. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. The #' partitions are then zipped, and we do a merge join on each partition. #' The key equality function used for this join is the reference region #' overlap function. Since this is a right outer join, all values in the -#' left RDD that do not overlap a value from the right RDD are dropped. 
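The two inner-join flavors documented in the surrounding hunks differ only in execution strategy; a sketch of both follows, assuming ``reads`` is an ``AlignmentRecordDataset`` and ``features`` is a ``FeatureDataset`` loaded elsewhere.

.. code:: r

    library(bdgenomics.adam)

    # assumes `reads` (AlignmentRecordDataset) and `features` (FeatureDataset)
    # broadcast join: the left dataset (here `features`) is collected on the
    # driver and broadcast to every executor
    broadcastJoined <- broadcastRegionJoin(features, reads, 0)

    # shuffle join: both datasets are co-partitioned and sorted, then merge-joined
    shuffleJoined <- shuffleRegionJoin(reads, features, 0)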
-#' If a value from the right RDD does not overlap any values in the left -#' RDD, it will be paired with a `None` in the product of the join. +#' left genomic dataset that do not overlap a value from the right genomic dataset are dropped. +#' If a value from the right genomic dataset does not overlap any values in the left +#' genomic dataset, it will be paired with a `None` in the product of the join. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and all keys from the -#' right RDD that did not overlap a key in the left RDD. +#' right genomic dataset that did not overlap a key in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -602,23 +602,23 @@ setMethod("rightOuterShuffleRegionJoin", flankSize)) }) -#' Performs a sort-merge left outer join between this RDD and another RDD. +#' Performs a sort-merge left outer join between this genomic dataset and another genomic dataset. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. The #' partitions are then zipped, and we do a merge join on each partition. #' The key equality function used for this join is the reference region #' overlap function. Since this is a left outer join, all values in the -#' right RDD that do not overlap a value from the left RDD are dropped. -#' If a value from the left RDD does not overlap any values in the right -#' RDD, it will be paired with a `None` in the product of the join. +#' right genomic dataset that do not overlap a value from the left genomic dataset are dropped. +#' If a value from the left genomic dataset does not overlap any values in the right +#' genomic dataset, it will be paired with a `None` in the product of the join. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and all keys from the -#' left RDD that did not overlap a key in the left RDD. +#' left genomic dataset that did not overlap a key in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -633,24 +633,24 @@ setMethod("leftOuterShuffleRegionJoin", flankSize)) }) -#' Performs a sort-merge left outer join between this RDD and another RDD, +#' Performs a sort-merge left outer join between this genomic dataset and another genomic dataset, #' followed by a groupBy on the left value. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. 
The #' partitions are then zipped, and we do a merge join on each partition. #' The key equality function used for this join is the reference region #' overlap function. Since this is a left outer join, all values in the -#' right RDD that do not overlap a value from the left RDD are dropped. -#' If a value from the left RDD does not overlap any values in the right -#' RDD, it will be paired with an empty Iterable in the product of the join. +#' right genomic dataset that do not overlap a value from the left genomic dataset are dropped. +#' If a value from the left genomic dataset does not overlap any values in the right +#' genomic dataset, it will be paired with an empty Iterable in the product of the join. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and all keys from the -#' left RDD that did not overlap a key in the left RDD. +#' left genomic dataset that did not overlap a key in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -665,20 +665,20 @@ setMethod("leftOuterShuffleRegionJoinAndGroupByLeft", flankSize)) }) -#' Performs a sort-merge full outer join between this RDD and another RDD. +#' Performs a sort-merge full outer join between this genomic dataset and another genomic dataset. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. The #' partitions are then zipped, and we do a merge join on each partition. #' The key equality function used for this join is the reference region #' overlap function. Since this is a full outer join, if a value from either -#' RDD does not overlap any values in the other RDD, it will be paired with +#' genomic dataset does not overlap any values in the other genomic dataset, it will be paired with #' a `None` in the product of the join. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and values that did not #' overlap will be paired with a `None`. #' @@ -695,23 +695,23 @@ setMethod("fullOuterShuffleRegionJoin", flankSize)) }) -#' Performs a sort-merge right outer join between this RDD and another RDD, +#' Performs a sort-merge right outer join between this genomic dataset and another genomic dataset, #' followed by a groupBy on the left value. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. The #' partitions are then zipped, and we do a merge join on each partition. 
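The outer and grouped join variants follow the same calling convention; for example, under the same assumptions about ``reads`` and ``features`` as above:

.. code:: r

    library(bdgenomics.adam)

    # assumes `reads` (AlignmentRecordDataset) and `features` (FeatureDataset)
    # keep every read, grouping any overlapping features with it
    readsWithFeatures <- leftOuterShuffleRegionJoinAndGroupByLeft(reads, features, 0)

    # keep values from both sides; non-overlapping values are paired with None
    allPairs <- fullOuterShuffleRegionJoin(reads, features, 0)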
#' The key equality function used for this join is the reference region #' overlap function. Since this is a right outer join, all values from the -#' right RDD who did not overlap a value from the left RDD are placed into +#' right genomic dataset who did not overlap a value from the left genomic dataset are placed into #' a length-1 Iterable with a `None` key. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, and all values from the -#' right RDD that did not overlap an item in the left RDD. +#' right genomic dataset that did not overlap an item in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -726,22 +726,22 @@ setMethod("rightOuterShuffleRegionJoinAndGroupByLeft", flankSize)) }) -#' Performs a sort-merge inner join between this RDD and another RDD, +#' Performs a sort-merge inner join between this genomic dataset and another genomic dataset, #' followed by a groupBy on the left value. #' -#' In a sort-merge join, both RDDs are co-partitioned and sorted. The +#' In a sort-merge join, both genomic datasets are co-partitioned and sorted. The #' partitions are then zipped, and we do a merge join on each partition. #' The key equality function used for this join is the reference region #' overlap function. In the same operation, we group all values by the left -#' item in the RDD. +#' item in the genomic dataset. #' -#' @param ardd The left RDD in the join. -#' @param genomicRdd The right RDD in the join. +#' @param ardd The left genomic dataset in the join. +#' @param genomicRdd The right genomic dataset in the join. #' @param flankSize Sets a flankSize for the distance between elements to be #' joined. If set to 0, an overlap is required to join two elements. -#' @return Returns a new genomic RDD containing all pairs of keys that +#' @return Returns a new genomic dataset containing all pairs of keys that #' overlapped in the genomic coordinate space, grouped together by -#' the value they overlapped in the left RDD. +#' the value they overlapped in the left genomic dataset. #' #' @importFrom SparkR sparkR.callJMethod #' @@ -757,14 +757,14 @@ setMethod("shuffleRegionJoinAndGroupByLeft", }) setMethod("replaceRdd", - signature(ardd = "AlignmentRecordRDD", + signature(ardd = "AlignmentRecordDataset", rdd = "jobj"), function(ardd, rdd) { - AlignmentRecordRDD(rdd) + AlignmentRecordDataset(rdd) }) setMethod("inferConversionFn", - signature(ardd = "AlignmentRecordRDD", + signature(ardd = "AlignmentRecordDataset", destClass = "character"), function(ardd, destClass) { paste0("org.bdgenomics.adam.api.java.AlignmentRecordsTo", @@ -773,22 +773,22 @@ setMethod("inferConversionFn", #' Convert this set of reads into fragments. #' -#' @param ardd The RDD to apply this to. -#' @return Returns a FragmentRDD where all reads have been grouped together by +#' @param ardd The genomic dataset to apply this to. +#' @return Returns a FragmentDataset where all reads have been grouped together by #' the original sequence fragment they come from. 
#' #' @importFrom SparkR sparkR.callJMethod #' #' @export setMethod("toFragments", - signature(ardd = "AlignmentRecordRDD"), + signature(ardd = "AlignmentRecordDataset"), function(ardd) { - FragmentRDD(sparkR.callJMethod(ardd@jrdd, "toFragments")) + FragmentDataset(sparkR.callJMethod(ardd@jrdd, "toFragments")) }) -#' Saves this RDD to disk as a SAM/BAM/CRAM file. +#' Saves this genomic dataset to disk as a SAM/BAM/CRAM file. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath The path to save the file to. #' @param asType The type of file to save. Valid choices are SAM, BAM, #' CRAM, and NA. If None, the file type is inferred from the extension. @@ -800,7 +800,7 @@ setMethod("toFragments", #' #' @export setMethod("saveAsSam", - signature(ardd = "AlignmentRecordRDD", filePath = "character"), + signature(ardd = "AlignmentRecordDataset", filePath = "character"), function(ardd, filePath, asType = NA, @@ -825,25 +825,25 @@ setMethod("saveAsSam", isSorted)) }) -#' Converts this set of reads into a corresponding CoverageRDD. +#' Converts this set of reads into a corresponding CoverageDataset. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param collapse Determines whether to merge adjacent coverage elements with #' the same score to a single coverage observation. -#' @return Returns an RDD with observed coverage. +#' @return Returns a genomic dataset with observed coverage. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export setMethod("toCoverage", - signature(ardd = "AlignmentRecordRDD"), + signature(ardd = "AlignmentRecordDataset"), function(ardd, collapse = TRUE) { - CoverageRDD(sparkR.callJMethod(ardd@jrdd, "toCoverage", collapse)) + CoverageDataset(sparkR.callJMethod(ardd@jrdd, "toCoverage", collapse)) }) -#' Saves this RDD to disk, with the type identified by the extension. +#' Saves this genomic dataset to disk, with the type identified by the extension. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath The path to save the file to. #' @param isSorted Whether the file is sorted or not. #' @@ -851,14 +851,14 @@ setMethod("toCoverage", #' #' @export setMethod("save", - signature(ardd = "AlignmentRecordRDD", filePath = "character"), + signature(ardd = "AlignmentRecordDataset", filePath = "character"), function(ardd, filePath, isSorted = FALSE) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath, isSorted)) }) #' Cuts reads into _k_-mers, and then counts the occurrences of each _k_-mer. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param kmerLength The value of _k_ to use for cutting _k_-mers. #' @return Returns a DataFrame containing k-mer/count pairs. #' @@ -866,7 +866,7 @@ setMethod("save", #' #' @export setMethod("countKmers", - signature(ardd = "AlignmentRecordRDD", kmerLength = "numeric"), + signature(ardd = "AlignmentRecordDataset", kmerLength = "numeric"), function(ardd, kmerLength) { new("SparkDataFrame", sparkR.callJMethod(sparkR.callJMethod(ardd@jrdd, @@ -883,16 +883,16 @@ setMethod("countKmers", #' put at the end and sorted by read name. Contigs are ordered lexicographically #' by name. #' -#' @param ardd The RDD to apply this to. -#' @return A new, sorted AlignmentRecordRDD. +#' @param ardd The genomic dataset to apply this to. +#' @return A new, sorted AlignmentRecordDataset. 
#' #' @importFrom SparkR sparkR.callJMethod #' #' @export setMethod("sortReadsByReferencePosition", - signature(ardd = "AlignmentRecordRDD"), + signature(ardd = "AlignmentRecordDataset"), function(ardd) { - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "sortReadsByReferencePosition")) + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "sortReadsByReferencePosition")) }) #' Sorts our read data by reference positions, with contigs ordered by index. @@ -901,31 +901,31 @@ setMethod("sortReadsByReferencePosition", #' put at the end and sorted by read name. Contigs are ordered by index that #' they are ordered in the sequence metadata. #' -#' @param ardd The RDD to apply this to. -#' @return A new, sorted AlignmentRecordRDD. +#' @param ardd The genomic dataset to apply this to. +#' @return A new, sorted AlignmentRecordDataset. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export setMethod("sortReadsByReferencePositionAndIndex", - signature(ardd = "AlignmentRecordRDD"), + signature(ardd = "AlignmentRecordDataset"), function(ardd) { - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "sortReadsByReferencePositionAndIndex")) + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "sortReadsByReferencePositionAndIndex")) }) #' Marks reads as possible fragment duplicates. #' -#' @param ardd The RDD to apply this to. -#' @return A new RDD where reads have the duplicate read flag set. Duplicate +#' @param ardd The genomic dataset to apply this to. +#' @return A new genomic dataset where reads have the duplicate read flag set. Duplicate #' reads are NOT filtered out. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export setMethod("markDuplicates", - signature(ardd = "AlignmentRecordRDD"), + signature(ardd = "AlignmentRecordDataset"), function(ardd) { - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "markDuplicates")) + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "markDuplicates")) }) #' Runs base quality score recalibration on a set of reads. @@ -933,7 +933,7 @@ setMethod("markDuplicates", #' Uses a table of known SNPs to mask true variation during the recalibration #' process. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param knownSnps A table of known SNPs to mask valid variants. #' @param validationStringency The stringency to apply towards validating BQSR. #' @@ -941,10 +941,10 @@ setMethod("markDuplicates", #' #' @export setMethod("recalibrateBaseQualities", - signature(ardd = "AlignmentRecordRDD", knownSnps = "VariantRDD", validationStringency = "character"), + signature(ardd = "AlignmentRecordDataset", knownSnps = "VariantDataset", validationStringency = "character"), function(ardd, knownSnps, validationStringency) { stringency <- sparkR.callJStatic("htsjdk.samtools.ValidationStringency", "valueOf", validationStringency) - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "recalibrateBaseQualities", knownSnps@jrdd, stringency)) + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "recalibrateBaseQualities", knownSnps@jrdd, stringency)) }) #' Realigns indels using a consensus-based heuristic. @@ -952,7 +952,7 @@ setMethod("recalibrateBaseQualities", #' If no known indels are provided, generates consensuses from reads. Else, #' generates consensuses from previously seen variants. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param isSorted If the input data is sorted, setting this parameter to true #' avoids a second sort. 
#' @param maxIndelSize The size of the largest indel to use for realignment. @@ -962,14 +962,14 @@ setMethod("recalibrateBaseQualities", #' are only finalized if the log-odds threshold is exceeded. #' @param maxTargetSize The maximum width of a single target region for #' realignment. -#' @param knownIndels An RDD of previously called INDEL variants. -#' @return Returns an RDD of mapped reads which have been realigned. +#' @param knownIndels A genomic dataset of previously called INDEL variants. +#' @return Returns a genomic dataset of mapped reads which have been realigned. #' #' @importFrom SparkR sparkR.callJMethod sparkR.callJStatic #' #' @export setMethod("realignIndels", - signature(ardd = "AlignmentRecordRDD"), + signature(ardd = "AlignmentRecordDataset"), function(ardd, isSorted = FALSE, maxIndelSize = 500, maxConsensusNumber = 30, lodThreshold = 5.0, maxTargetSize = 3000, @@ -978,7 +978,7 @@ setMethod("realignIndels", if (is.na(knownIndels)) { consensusModel <- sparkR.callJStatic("org.bdgenomics.adam.algorithms.consensus.ConsensusGenerator", "fromKnowns", knownIndels@jrdd) - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "realignIndels", + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "realignIndels", consensusModel, isSorted, maxIndelSize, @@ -989,7 +989,7 @@ setMethod("realignIndels", } else { consensusModel <- sparkR.callJStatic("org.bdgenomics.adam.algorithms.consensus.ConsensusGenerator", "fromReads") - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "realignIndels", + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "realignIndels", consensusModel, isSorted, maxIndelSize, @@ -1001,7 +1001,7 @@ setMethod("realignIndels", #' Saves coverage as a feature file. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath The location to write the output. #' @param asSingleFile If true, merges the sharded output into a single file. #' @@ -1009,7 +1009,7 @@ setMethod("realignIndels", #' #' @export setMethod("save", - signature(ardd = "CoverageRDD", filePath = "character"), + signature(ardd = "CoverageDataset", filePath = "character"), function(ardd, filePath, asSingleFile = FALSE) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath, asSingleFile)) }) @@ -1017,32 +1017,32 @@ setMethod("save", #' Merges adjacent ReferenceRegions with the same coverage value. #' #' This reduces the loss of coverage information while reducing the number of -#' of records in the RDD. For example, adjacent records Coverage("chr1", 1, 10, +#' of records in the genomic dataset. For example, adjacent records Coverage("chr1", 1, 10, #' 3.0) and Coverage("chr1", 10, 20, 3.0) would be merged into one record #' Coverage("chr1", 1, 20, 3.0). #' -#' @param ardd The RDD to apply this to. -#' @return An RDD with merged tuples of adjacent sites with same coverage. +#' @param ardd The genomic dataset to apply this to. +#' @return A genomic dataset with merged tuples of adjacent sites with same coverage. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("collapse", signature(ardd = "CoverageRDD"), +setMethod("collapse", signature(ardd = "CoverageDataset"), function(ardd) { - CoverageRDD(sparkR.callJMethod(ardd@jrdd, "collapse")) + CoverageDataset(sparkR.callJMethod(ardd@jrdd, "collapse")) }) -#' Converts CoverageRDD to FeatureRDD. +#' Converts CoverageDataset to FeatureDataset. #' -#' @param ardd The RDD to apply this to. -#' @return Returns a FeatureRDD from a CoverageRDD. 
+#' @param ardd The genomic dataset to apply this to. +#' @return Returns a FeatureDataset from a CoverageDataset. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("toFeatures", signature(ardd = "CoverageRDD"), +setMethod("toFeatures", signature(ardd = "CoverageDataset"), function(ardd) { - FeatureRDD(sparkR.callJMethod(ardd@jrdd, "toFeatures")) + FeatureDataset(sparkR.callJMethod(ardd@jrdd, "toFeatures")) }) #' Gets coverage overlapping specified ReferenceRegion. @@ -1051,16 +1051,16 @@ setMethod("toFeatures", signature(ardd = "CoverageRDD"), #' bin together ReferenceRegions of equal size. The coverage of each bin is the #' coverage of the first base pair in that bin. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param bpPerBin Number of bases to combine to one bin. -#' @return Returns a sparsified CoverageRDD. +#' @return Returns a sparsified CoverageDataset. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("coverage", signature(ardd = "CoverageRDD"), +setMethod("coverage", signature(ardd = "CoverageDataset"), function(ardd, bpPerBin = 1) { - CoverageRDD(sparkR.callJMethod(ardd@jrdd, "coverage", bpPerBin)) + CoverageDataset(sparkR.callJMethod(ardd@jrdd, "coverage", bpPerBin)) }) #' Gets coverage overlapping specified ReferenceRegion. @@ -1069,37 +1069,37 @@ setMethod("coverage", signature(ardd = "CoverageRDD"), #' bin together ReferenceRegions of equal size. The coverage of each bin is the #' average coverage of the bases in that bin. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param bpPerBin Number of bases to combine to one bin. -#' @return Returns a sparsified CoverageRDD. +#' @return Returns a sparsified CoverageDataset. #' -#' @rdname CoverageRDD +#' @rdname CoverageDataset #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("aggregatedCoverage", signature(ardd = "CoverageRDD"), +setMethod("aggregatedCoverage", signature(ardd = "CoverageDataset"), function(ardd, bpPerBin = 1) { - CoverageRDD(sparkR.callJMethod(ardd@jrdd, "aggregatedCoverage", bpPerBin)) + CoverageDataset(sparkR.callJMethod(ardd@jrdd, "aggregatedCoverage", bpPerBin)) }) -#' Gets flattened RDD of coverage, with coverage mapped to each base pair. +#' Gets flattened genomic dataset of coverage, with coverage mapped to each base pair. #' #' The opposite operation of collapse. #' -#' @param ardd The RDD to apply this to. -#' @return New CoverageRDD of flattened coverage. +#' @param ardd The genomic dataset to apply this to. +#' @return New CoverageDataset of flattened coverage. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("flatten", signature(ardd = "CoverageRDD"), +setMethod("flatten", signature(ardd = "CoverageDataset"), function(ardd) { - CoverageRDD(sparkR.callJMethod(ardd@jrdd, "flatten")) + CoverageDataset(sparkR.callJMethod(ardd@jrdd, "flatten")) }) setMethod("inferConversionFn", - signature(ardd = "FeatureRDD", + signature(ardd = "FeatureDataset", destClass = "character"), function(ardd, destClass) { paste0("org.bdgenomics.adam.api.java.FeaturesTo", @@ -1107,10 +1107,10 @@ setMethod("inferConversionFn", }) setMethod("replaceRdd", - signature(ardd = "FeatureRDD", + signature(ardd = "FeatureDataset", rdd = "jobj"), function(ardd, rdd) { - FeatureRDD(rdd) + FeatureDataset(rdd) }) #' Saves coverage, autodetecting the file type from the extension. 
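The ``CoverageDataset`` methods renamed above compose naturally; a sketch, assuming ``readCoverage`` is a ``CoverageDataset`` (for example the result of ``toCoverage`` on reads).

.. code:: r

    library(bdgenomics.adam)

    # assumes `readCoverage` is a CoverageDataset, e.g. from toCoverage(reads)
    collapsed <- collapse(readCoverage)                        # merge adjacent equal-coverage sites
    binned <- aggregatedCoverage(readCoverage, bpPerBin = 10)  # average coverage per 10 bp bin
    coverageFeatures <- toFeatures(collapsed)                  # convert to a FeatureDataset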
@@ -1120,7 +1120,7 @@ setMethod("replaceRdd", #' these match, we fall back to Parquet. These files are written as sharded text #' files, which can be merged by passing asSingleFile = True. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath The location to write the output. #' @param asSingleFile If true, merges the sharded output into a single file. #' @param disableFastConcat If asSingleFile is true, disables the use of the @@ -1130,28 +1130,28 @@ setMethod("replaceRdd", #' #' @export setMethod("save", - signature(ardd = "FeatureRDD", filePath = "character"), + signature(ardd = "FeatureDataset", filePath = "character"), function(ardd, filePath, asSingleFile = FALSE, disableFastConcat = FALSE) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath, asSingleFile, disableFastConcat)) }) -#' Converts the FeatureRDD to a CoverageRDD. +#' Converts the FeatureDataset to a CoverageDataset. #' -#' @param ardd The RDD to apply this to. -#' @return Returns a new CoverageRDD. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns a new CoverageDataset. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("toCoverage", signature(ardd = "FeatureRDD"), +setMethod("toCoverage", signature(ardd = "FeatureDataset"), function(ardd) { - CoverageRDD(sparkR.callJMethod(ardd@jrdd, "toCoverage")) + CoverageDataset(sparkR.callJMethod(ardd@jrdd, "toCoverage")) }) setMethod("inferConversionFn", - signature(ardd = "FragmentRDD", + signature(ardd = "FragmentDataset", destClass = "character"), function(ardd, destClass) { paste0("org.bdgenomics.adam.api.java.FragmentsTo", @@ -1159,54 +1159,54 @@ setMethod("inferConversionFn", }) setMethod("replaceRdd", - signature(ardd = "FragmentRDD", + signature(ardd = "FragmentDataset", rdd = "jobj"), function(ardd, rdd) { - FragmentRDD(rdd) + FragmentDataset(rdd) }) -#' Splits up the reads in a Fragment, and creates a new RDD. +#' Splits up the reads in a Fragment, and creates a new genomic dataset. #' -#' @param ardd The RDD to apply this to. -#' @return Returns this RDD converted back to reads. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns this genomic dataset converted back to reads. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("toReads", signature(ardd = "FragmentRDD"), +setMethod("toReads", signature(ardd = "FragmentDataset"), function(ardd) { - AlignmentRecordRDD(sparkR.callJMethod(ardd@jrdd, "toReads")) + AlignmentRecordDataset(sparkR.callJMethod(ardd@jrdd, "toReads")) }) #' Marks reads as possible fragment duplicates. #' -#' @param ardd The RDD to apply this to. -#' @return A new RDD where reads have the duplicate read flag set. Duplicate +#' @param ardd The genomic dataset to apply this to. +#' @return A new genomic dataset where reads have the duplicate read flag set. Duplicate #' reads are NOT filtered out. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("markDuplicates", signature(ardd = "FragmentRDD"), +setMethod("markDuplicates", signature(ardd = "FragmentDataset"), function(ardd) { - FragmentRDD(sparkR.callJMethod(ardd@jrdd, "markDuplicates")) + FragmentDataset(sparkR.callJMethod(ardd@jrdd, "markDuplicates")) }) #' Saves fragments to Parquet. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save fragments to. 
#' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("save", signature(ardd = "FragmentRDD", filePath = "character"), +setMethod("save", signature(ardd = "FragmentDataset", filePath = "character"), function(ardd, filePath) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath)) }) setMethod("inferConversionFn", - signature(ardd = "GenotypeRDD", + signature(ardd = "GenotypeDataset", destClass = "character"), function(ardd, destClass) { paste0("org.bdgenomics.adam.api.java.GenotypesTo", @@ -1214,58 +1214,58 @@ setMethod("inferConversionFn", }) setMethod("replaceRdd", - signature(ardd = "GenotypeRDD", + signature(ardd = "GenotypeDataset", rdd = "jobj"), function(ardd, rdd) { - GenotypeRDD(rdd) + GenotypeDataset(rdd) }) -#' Saves this RDD of genotypes to disk as Parquet. +#' Saves this genomic dataset of genotypes to disk as Parquet. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save file to. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("saveAsParquet", signature(ardd = "GenotypeRDD", filePath = "character"), +setMethod("saveAsParquet", signature(ardd = "GenotypeDataset", filePath = "character"), function(ardd, filePath) { invisible(sparkR.callJMethod(ardd@jrdd, "saveAsParquet", filePath)) }) -#' Extracts the variants contained in this RDD of genotypes. +#' Extracts the variants contained in this genomic dataset of genotypes. #' #' Does not perform any filtering looking at whether the variant was called or #' not. By default, does not deduplicate variants. #' #' @param dedupe If true, drops variants described in more than one genotype #' record. -#' @return Returns the variants described by this GenotypeRDD. +#' @return Returns the variants described by this GenotypeDataset. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("toVariants", signature(ardd = "GenotypeRDD"), +setMethod("toVariants", signature(ardd = "GenotypeDataset"), function(ardd, dedupe=FALSE) { - VariantRDD(sparkR.callJMethod(ardd@jrdd, "toVariants", dedupe)) + VariantDataset(sparkR.callJMethod(ardd@jrdd, "toVariants", dedupe)) }) -#' Converts this RDD of Genotypes to VariantContexts. +#' Converts this genomic dataset of Genotypes to VariantContexts. #' -#' @param ardd The RDD to apply this to. -#' @return Returns this RDD of Genotypes as VariantContexts. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns this genomic dataset of Genotypes as VariantContexts. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("toVariantContexts", signature(ardd = "GenotypeRDD"), +setMethod("toVariantContexts", signature(ardd = "GenotypeDataset"), function(ardd) { - VariantContextRDD(sparkR.callJMethod(ardd@jrdd, "toVariantContexts")) + VariantContextDataset(sparkR.callJMethod(ardd@jrdd, "toVariantContexts")) }) setMethod("inferConversionFn", - signature(ardd = "NucleotideContigFragmentRDD", + signature(ardd = "NucleotideContigFragmentDataset", destClass = "character"), function(ardd, destClass) { paste0("org.bdgenomics.adam.api.java.ContigsTo", @@ -1273,10 +1273,10 @@ setMethod("inferConversionFn", }) setMethod("replaceRdd", - signature(ardd = "NucleotideContigFragmentRDD", + signature(ardd = "NucleotideContigFragmentDataset", rdd = "jobj"), function(ardd, rdd) { - NucleotideContigFragmentRDD(rdd) + NucleotideContigFragmentDataset(rdd) }) #' Save nucleotide contig fragments as Parquet or FASTA. 
@@ -1284,38 +1284,38 @@ setMethod("replaceRdd", #' If filename ends in .fa or .fasta, saves as Fasta. If not, saves fragments to #' Parquet. Defaults to 60 character line length, if saving as FASTA. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save to. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("save", signature(ardd = "NucleotideContigFragmentRDD", filePath = "character"), +setMethod("save", signature(ardd = "NucleotideContigFragmentDataset", filePath = "character"), function(ardd, filePath) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath)) }) -#' For all adjacent records in the RDD, we extend the records so that the +#' For all adjacent records in the genomic dataset, we extend the records so that the #' adjacent records now overlap by _n_ bases, where _n_ is the flank length. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param flankLength The length to extend adjacent records by. -#' @return Returns the RDD, with all adjacent fragments extended with flanking +#' @return Returns the genomic dataset, with all adjacent fragments extended with flanking #' sequence. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export setMethod("flankAdjacentFragments", - signature(ardd = "NucleotideContigFragmentRDD", flankLength = "numeric"), + signature(ardd = "NucleotideContigFragmentDataset", flankLength = "numeric"), function(ardd, flankLength) { - NucleotideContigFragmentRDD(sparkR.callJMethod(ardd@jrdd, + NucleotideContigFragmentDataset(sparkR.callJMethod(ardd@jrdd, "flankAdjacentFragments", flankLength)) }) setMethod("inferConversionFn", - signature(ardd = "VariantRDD", + signature(ardd = "VariantDataset", destClass = "character"), function(ardd, destClass) { paste0("org.bdgenomics.adam.api.java.VariantsTo", @@ -1323,48 +1323,48 @@ setMethod("inferConversionFn", }) setMethod("replaceRdd", - signature(ardd = "VariantRDD", + signature(ardd = "VariantDataset", rdd = "jobj"), function(ardd, rdd) { - VariantRDD(rdd) + VariantDataset(rdd) }) -#' Saves this RDD of variants to disk as Parquet. +#' Saves this genomic dataset of variants to disk as Parquet. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save file to. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("saveAsParquet", signature(ardd = "VariantRDD", filePath = "character"), +setMethod("saveAsParquet", signature(ardd = "VariantDataset", filePath = "character"), function(ardd, filePath) { invisible(sparkR.callJMethod(ardd@jrdd, "saveAsParquet", filePath)) }) -#' Converts this RDD of Variants to VariantContexts. +#' Converts this genomic dataset of Variants to VariantContexts. #' -#' @param ardd The RDD to apply this to. -#' @return Returns this RDD of Variants as VariantContexts. +#' @param ardd The genomic dataset to apply this to. +#' @return Returns this genomic dataset of Variants as VariantContexts. 
#' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("toVariantContexts", signature(ardd = "VariantRDD"), +setMethod("toVariantContexts", signature(ardd = "VariantDataset"), function(ardd) { - VariantContextRDD(sparkR.callJMethod(ardd@jrdd, "toVariantContexts")) + VariantContextDataset(sparkR.callJMethod(ardd@jrdd, "toVariantContexts")) }) setMethod("replaceRdd", - signature(ardd = "VariantContextRDD", + signature(ardd = "VariantContextDataset", rdd = "jobj"), function(ardd, rdd) { - VariantContextRDD(rdd) + VariantContextDataset(rdd) }) -#' Saves this RDD of variant contexts to disk as VCF +#' Saves this genomic dataset of variant contexts to disk as VCF. #' -#' @param ardd The RDD to apply this to. +#' @param ardd The genomic dataset to apply this to. #' @param filePath Path to save VCF to. #' @param asSingleFile If true, saves the output as a single file #' by merging the sharded output after saving. @@ -1377,7 +1377,7 @@ setMethod("replaceRdd", #' @importFrom SparkR sparkR.callJMethod sparkR.callJStatic #' #' @export -setMethod("saveAsVcf", signature(ardd = "VariantContextRDD", filePath = "character"), +setMethod("saveAsVcf", signature(ardd = "VariantContextDataset", filePath = "character"), function(ardd, filePath, asSingleFile = TRUE, diff --git a/adam-r/bdgenomics.adam/tests/testthat/test_alignmentRecordRdd.R b/adam-r/bdgenomics.adam/tests/testthat/test_alignmentRecordDataset.R similarity index 98% rename from adam-r/bdgenomics.adam/tests/testthat/test_alignmentRecordRdd.R rename to adam-r/bdgenomics.adam/tests/testthat/test_alignmentRecordDataset.R index 8fa01c16b4..80d2162331 100644 --- a/adam-r/bdgenomics.adam/tests/testthat/test_alignmentRecordRdd.R +++ b/adam-r/bdgenomics.adam/tests/testthat/test_alignmentRecordDataset.R @@ -101,9 +101,9 @@ test_that("transmute to coverage", { readsAsCoverage = transmute(reads, function(df) { select(df, df$contigName, df$start, df$end, alias(cast(df$mapq, "double"), "count"), alias(cast(df$recordGroupSample, "string"), "optSampleId")) - }, "CoverageRDD") + }, "CoverageDataset") - expect_true(is(readsAsCoverage, "CoverageRDD")) + expect_true(is(readsAsCoverage, "CoverageDataset")) expect_equal(count(toDF(readsAsCoverage)), 5) }) diff --git a/adam-r/bdgenomics.adam/tests/testthat/test_featureRdd.R b/adam-r/bdgenomics.adam/tests/testthat/test_featureDataset.R similarity index 100% rename from adam-r/bdgenomics.adam/tests/testthat/test_featureRdd.R rename to adam-r/bdgenomics.adam/tests/testthat/test_featureDataset.R diff --git a/adam-r/bdgenomics.adam/tests/testthat/test_genotypeRdd.R b/adam-r/bdgenomics.adam/tests/testthat/test_genotypeDataset.R similarity index 100% rename from adam-r/bdgenomics.adam/tests/testthat/test_genotypeRdd.R rename to adam-r/bdgenomics.adam/tests/testthat/test_genotypeDataset.R diff --git a/adam-r/bdgenomics.adam/tests/testthat/test_variantRdd.R b/adam-r/bdgenomics.adam/tests/testthat/test_variantDataset.R similarity index 100% rename from adam-r/bdgenomics.adam/tests/testthat/test_variantRdd.R rename to adam-r/bdgenomics.adam/tests/testthat/test_variantDataset.R diff --git a/docs/api/adamContext.rst b/docs/api/adamContext.rst index ea85eaf624..a13fbde68a 100644 --- a/docs/api/adamContext.rst +++ b/docs/api/adamContext.rst @@ -14,9 +14,9 @@ use this, import the implicit, and call an ``ADAMContext`` method: // the ._ at the end imports the implicit from the ADAMContext companion object import org.bdgenomics.adam.rdd.ADAMContext._ - import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD + import 
org.bdgenomics.adam.rdd.read.AlignmentRecordDataset - def loadReads(filePath: String, sc: SparkContext): AlignmentRecordRDD = { + def loadReads(filePath: String, sc: SparkContext): AlignmentRecordDataset = { sc.loadAlignments(filePath) } @@ -27,12 +27,12 @@ In Java, instantiate a JavaADAMContext, which wraps an ADAMContext: import org.apache.spark.apis.java.JavaSparkContext; import org.bdgenomics.adam.apis.java.JavaADAMContext; import org.bdgenomics.adam.rdd.ADAMContext; - import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD; + import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset; class LoadReads { - public static AlignmentRecordRDD loadReads(String filePath, - JavaSparkContext jsc) { + public static AlignmentRecordDataset loadReads(String filePath, + JavaSparkContext jsc) { // create an ADAMContext first ADAMContext ac = new ADAMContext(jsc.sc()); @@ -55,7 +55,7 @@ From Python, instantiate an ADAMContext, which wraps a SparkContext: With an ``ADAMContext``, you can load: -- Single reads as an ``AlignmentRecordRDD``: +- Single reads as an ``AlignmentRecordDataset``: - From SAM/BAM/CRAM using ``loadBam`` (Scala only) - Selected regions from an indexed BAM/CRAM using ``loadIndexedBam`` (Scala, Java, and Python) @@ -65,7 +65,7 @@ With an ``ADAMContext``, you can load: - The ``loadAlignments`` method will load from any of the above formats, and will autodetect the underlying format (Scala, Java, Python, and R, also supports loading reads from FASTA) -- Paired reads as a ``FragmentRDD``: +- Paired reads as a ``FragmentDataset``: - From interleaved FASTQ using ``loadInterleavedFastqAsFragments`` (Scala only) - From Parquet using ``loadParquetFragments`` (Scala only) @@ -73,23 +73,23 @@ With an ``ADAMContext``, you can load: and will autodetect the underlying file format. If the file is a SAM/BAM/CRAM file and the file is queryname sorted, the data will be converted to fragments without performing a shuffle. 
(Scala, Java, Python, and R) -- All of the genotypes associated with a variant as a ``VariantContextRDD`` from Parquet +- All of the genotypes associated with a variant as a ``VariantContextDataset`` from Parquet using ``loadParquetVariantContexts`` (Scala only) -- VCF lines as a ``VariantContextRDD`` from VCF/BCF1 using ``loadVcf`` (Scala only) +- VCF lines as a ``VariantContextDataset`` from VCF/BCF1 using ``loadVcf`` (Scala only) - Selected lines from a tabix indexed VCF using ``loadIndexedVcf`` (Scala only) -- Genotypes as a ``GenotypeRDD``: +- Genotypes as a ``GenotypeDataset``: - From Parquet using ``loadParquetGenotypes`` (Scala only) - From partitioned Parquet using ``loadPartitionedParquetGenotypes`` (Scala only) - From either Parquet or VCF/BCF1 using ``loadGenotypes`` (Scala, Java, Python, and R) -- Variants as a ``VariantRDD``: +- Variants as a ``VariantDataset``: - From Parquet using ``loadParquetVariants`` (Scala only) - From partitioned Parquet using ``loadPartitionedParquetVariants`` (Scala only) - From either Parquet or VCF/BCF1 using ``loadVariants`` (Scala, Java, Python, and R) -- Genomic features as a ``FeatureRDD``: +- Genomic features as a ``FeatureDataset``: - From BED using ``loadBed`` (Scala only) - From GFF3 using ``loadGff3`` (Scala only) @@ -100,14 +100,14 @@ With an ``ADAMContext``, you can load: - From partitioned Parquet using ``loadPartitionedParquetFeatures`` (Scala only) - Autodetected from any of the above using ``loadFeatures`` (Scala, Java, Python, and R) -- Fragmented contig sequence as a ``NucleotideContigFragmentRDD``: +- Fragmented contig sequence as a ``NucleotideContigFragmentDataset``: - From FASTA with ``loadFasta`` (Scala only) - From Parquet with ``loadParquetContigFragments`` (Scala only) - From partitioned Parquet with ``loadPartitionedParquetContigFragments`` (Scala only) - Autodetected from either of the above using ``loadSequences`` (Scala, Java, Python, and R) -- Coverage data as a ``CoverageRDD``: +- Coverage data as a ``CoverageDataset``: - From Parquet using ``loadParquetCoverage`` (Scala only) - From Parquet or any of the feature file formats using ``loadCoverage`` (Scala only) diff --git a/docs/api/genomicRdd.rst b/docs/api/genomicDataset.rst similarity index 78% rename from docs/api/genomicRdd.rst rename to docs/api/genomicDataset.rst index a30531b2bb..4f108ac366 100644 --- a/docs/api/genomicRdd.rst +++ b/docs/api/genomicDataset.rst @@ -1,10 +1,10 @@ -Working with genomic data using GenomicRDDs -------------------------------------------- +Working with genomic data using GenomicDatasets +----------------------------------------------- As described in the section on using the `ADAMContext `__, ADAM loads genomic data into a -``GenomicRDD`` which is specialized for each datatype. This -``GenomicRDD`` wraps Apache Spark's Resilient Distributed Dataset (RDD, +``GenomicDataset`` which is specialized for each datatype. This +``GenomicDataset`` wraps Apache Spark's Resilient Distributed Dataset (RDD, (Zaharia et al. 2012)) API with genomic metadata. The ``RDD`` abstraction presents an array of data which is distributed across a cluster. ``RDD``\ s are backed by a computational lineage, which allows @@ -16,19 +16,19 @@ Around an ``RDD``, ADAM adds metadata which describes the genome, samples, or read group that a dataset came from. 
Specifically, ADAM supports the following metadata: -- ``GenomicRDD`` base: A sequence dictionary, which describes the +- ``GenomicDataset`` base: A sequence dictionary, which describes the reference assembly that data are aligned to, if it is aligned. Applies to all types. -- ``MultisampleGenomicRDD``: Adds metadata about the samples in a - dataset. Applies to ``GenotypeRDD``. -- ``ReadGroupGenomicRDD``: Adds metadata about the read groups attached - to a dataset. Applies to ``AlignmentRecordRDD`` and ``FragmentRDD``. +- ``MultisampleGenomicDataset``: Adds metadata about the samples in a + dataset. Applies to ``GenotypeDataset``. +- ``ReadGroupGenomicDataset``: Adds metadata about the read groups attached + to a dataset. Applies to ``AlignmentRecordDataset`` and ``FragmentDataset``. -Additionally, ``GenotypeRDD``, ``VariantRDD``, and ``VariantContextRDD`` +Additionally, ``GenotypeDataset``, ``VariantDataset``, and ``VariantContextDataset`` store the VCF header lines attached to the original file, to enable a round trip between Parquet and VCF. -``GenomicRDD``\ s can be transformed several ways. These include: +``GenomicDataset``\ s can be transformed several ways. These include: - The `core preprocessing <../algorithms/reads.html>`__ algorithms in ADAM: - Reads: @@ -42,32 +42,32 @@ round trip between Parquet and VCF. - `Mark duplicate fragments <../algorithms/dm.html>`__ -- `RDD transformations <#transforming-genomicrdds>`__ -- `Spark SQL transformations <#transforming-genomicrdds-via-spark-sql>`__ +- `Genomic dataset transformations <#transforming-genomicdatasets>`__ +- `Spark SQL transformations <#transforming-genomicdatasets-via-spark-sql>`__ - `By using ADAM to pipe out to another tool `__ -Transforming GenomicRDDs +Transforming GenomicDatasets ~~~~~~~~~~~~~~~~~~~~~~~~ -Although ``GenomicRDD``\ s do not extend Apache Spark's ``RDD`` class, +Although ``GenomicDataset``\ s do not extend Apache Spark's ``RDD`` class, ``RDD`` operations can be performed on them using the ``transform`` method. Currently, we only support ``RDD`` to ``RDD`` transformations -that keep the same type as the base type of the ``GenomicRDD``. To apply +that keep the same type as the base type of the ``GenomicDataset``. To apply an ``RDD`` transform, use the ``transform`` method, which takes a function mapping one ``RDD`` of the base type into another ``RDD`` of the base type. For example, we could use ``transform`` on an -``AlignmentRecordRDD`` to filter out reads that have a low mapping +``AlignmentRecordDataset`` to filter out reads that have a low mapping quality, but we cannot use ``transform`` to translate those reads into ``Feature``\ s showing the genomic locations covered by reads. -If we want to transform a ``GenomicRDD`` into a new ``GenomicRDD`` that +If we want to transform a ``GenomicDataset`` into a new ``GenomicDataset`` that contains a different datatype (e.g., reads to features), we can instead use the ``transmute`` function. The ``transmute`` function takes a function that transforms an ``RDD`` of the type of the first -``GenomicRDD`` into a new ``RDD`` that contains records of the type of -the second ``GenomicRDD``. Additionally, it takes an implicit function -that maps the metadata in the first ``GenomicRDD`` into the metadata -needed by the second ``GenomicRDD``. This is akin to the implicit +``GenomicDataset`` into a new ``RDD`` that contains records of the type of +the second ``GenomicDataset``. 
Additionally, it takes an implicit function +that maps the metadata in the first ``GenomicDataset`` into the metadata +needed by the second ``GenomicDataset``. This is akin to the implicit function required by the `pipe <#pipes.html>`__ API. As an example, let us use the ``transmute`` function to make features corresponding to reads containing INDELs: @@ -80,11 +80,11 @@ containing INDELs: val reads = sc.loadAlignments("path/to/my/reads.adam") // the type of the transmuted RDD normally needs to be specified - // import the FeatureRDD, which is the output type - import org.bdgenomics.adam.rdd.feature.FeatureRDD + // import the FeatureDataset, which is the output type + import org.bdgenomics.adam.rdd.feature.FeatureDataset import org.bdgenomics.formats.avro.Feature - val features: FeatureRDD = reads.transmute(rdd => { + val features: FeatureDataset = reads.transmute(rdd => { rdd.filter(r => { // does the CIGAR for this read contain an I or a D? Option(r.getCigar) @@ -99,12 +99,12 @@ containing INDELs: }) ``ADAMContext`` provides the implicit functions needed to run the -``transmute`` function between all ``GenomicRDD``\ s contained within +``transmute`` function between all ``GenomicDataset``\ s contained within the ``org.bdgenomics.adam.rdd`` package hierarchy. Any custom -``GenomicRDD`` can be supported by providing a user defined conversion +``GenomicDataset`` can be supported by providing a user defined conversion function. -Transforming GenomicRDDs via Spark SQL +Transforming GenomicDatasets via Spark SQL ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Spark SQL introduced the strongly-typed @@ -120,7 +120,7 @@ substantial speedups for certain queries. To resolve this, we added an ``adam-codegen`` package that generates Spark SQL compatible classes representing the ADAM schemas. These classes are available in the ``org.bdgenomics.adam.sql`` package. All -Avro-backed GenomicRDDs now support translation to Datasets via the +Avro-backed GenomicDatasets now support translation to Datasets via the ``dataset`` field, and transformation via the Spark SQL APIs through the ``transformDataset`` method. As an optimization, we lazily choose either the RDD or Dataset API depending on the calculation being performed. For @@ -150,19 +150,19 @@ an implementation note necessary only for those bypassing the ADAM APIs. Similar to ``transform``/``transformDataset``, there exists a ``transmuteDataset`` function that enables transformations between -``GenomicRDD``\ s of different types. +``GenomicDataset``\ s of different types. Using partitioned Parquet to speed up range based queries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -GenomicRDDs of types ``AlignmentRecordRDD``, ``GenotypeRDD``, -``VariantRDD``, and ``NucleotideFragmentContigRDD`` can be written as Parquet -using a Hive-style hierarchical directory scheme that is based on contig and +GenomicDatasets of types ``AlignmentRecordDataset``, ``GenotypeDataset``, +``VariantDataset``, and ``NucleotideContigFragmentDataset`` can be written as Parquet +using a Hive-style hierarchical directory scheme that is based on contig and genomic position. This partitioning reduces the latency of genomic range queries against these datasets, which is particularly important for interactive applications such as a genomic browser backed by an ADAM dataset.
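The subsections below walk through each step of this workflow; as a rough end-to-end sketch (illustrative paths, with a 1,000,000 bp partition width and query region chosen purely for the example), the round trip looks like:

.. code:: scala

    import org.bdgenomics.adam.models.ReferenceRegion
    import org.bdgenomics.adam.rdd.ADAMContext._

    val reads = sc.loadAlignments("path/to/my/reads.adam")

    // write a Hive-style partitioned Parquet dataset, binning positions into
    // 1,000,000 bp wide partitions within each contig
    reads.saveAsPartitionedParquet("path/to/my/reads.partitioned.adam", partitionSize = 1000000)

    // reload the partitioned dataset and run a genomic range query against it
    val partitionedReads = sc.loadPartitionedParquetAlignments("path/to/my/reads.partitioned.adam")
    val slice = partitionedReads.filterByOverlappingRegions(Seq(ReferenceRegion("chr1", 100000L, 200000L)))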
-The genomicRDD function -``GenomicRDD.filterByOverlappingRegions(queryRegionsList)`` builds a Spark SQL +The GenomicDataset function +``GenomicDataset.filterByOverlappingRegions(queryRegionsList)`` builds a Spark SQL query that uses this partitioning scheme. This can reduce latencies by more than 20x when repeatedly querying a dataset with genomic range filters. On a high coverage alignment dataset, this partitioning strategy improved @@ -170,8 +170,8 @@ latency from 1-2 minutes to 1-3 seconds when looking up genomic ranges. **Saving partitioned parquet files to disk** -A ``GenomicRDD`` can be written to disk as a partitioned Parquet dataset with the -``GenomicRDD`` function ``saveAsPartitionedParquet``. The optional +A ``GenomicDataset`` can be written to disk as a partitioned Parquet dataset with the +``GenomicDataset`` function ``saveAsPartitionedParquet``. The optional ``partitionSize`` parameter defines the width in base pairs of the partitions within each contig. @@ -185,7 +185,7 @@ ADAM ``transformGenotypes`` CLI. **Loading partitioned parquet files** -A GenomicRDD can be loaded from a partitioned Parquet dataset using the +A GenomicDataset can be loaded from a partitioned Parquet dataset using the ADAMContext function ``loadPartitionedParquet[*]`` specific to each data type such as ``loadPartitionedParquetAlignments``. @@ -234,15 +234,15 @@ by the Parquet files' partitioning scheme, and makes ``positionBin`` available a that can be queried through the Spark SQL API. ``positionBin`` is used internally by the public function ``GenomicDataset.filterByOverlappingRegions``. User code in ADAM-shell or user applications could similarly utilize the ``positionBin`` field when creating Spark -SQL queries on a ``genomicRDD.dataset`` backed by partitioned Parquet. +SQL queries on a ``genomicDataset.dataset`` backed by partitioned Parquet. **Re-using a previously loaded partitioned dataset:** When a partitioned dataset is first created within an ADAM session, a partition -discovery/initialization step is performed that can take several minutes for large datasets. -The original GenomicRDD object can then be re-used multiple times as the parent +discovery/initialization step is performed that can take several minutes for large datasets. +The original GenomicDataset object can then be re-used multiple times as the parent of different filtration and processing transformations and actions, without incurring -this initializiation cost again. Thus, re-use of a parent partitioned ``GenomicRDD`` +this initialization cost again. Thus, re-use of a parent partitioned ``GenomicDataset`` is key to realizing the latency advantages of partitioned datasets described above. .. code:: scala diff --git a/docs/api/joins.rst b/docs/api/joins.rst index 8a8c2e4e70..943e793d9f 100644 --- a/docs/api/joins.rst +++ b/docs/api/joins.rst @@ -47,7 +47,7 @@ To perform a BroadcastRegionJoin, use the following: dataset1.broadcastRegionJoin(dataset2) -Where ``dataset1`` and ``dataset2`` are ``GenomicRDD``\ s. If you used +Where ``dataset1`` and ``dataset2`` are ``GenomicDataset``\ s. If you used the ADAMContext to read a genomic dataset into memory, this condition is met. @@ -72,10 +72,10 @@ data, and all are called in a similar way: - Right outer join and group by right -Given two RDDs +Given two GenomicDatasets .. figure:: img/join_rdds.png - :alt: RDDs for Joins + :alt: Genomic Datasets for Joins A subset of these joins are depicted below. @@ -84,8 +84,8 @@
One common pattern involves joining a single dataset against many -datasets. An example of this is joining an RDD of features (e.g., -gene/exon coordinates) against many different RDDs of reads. If the +datasets. An example of this is joining a genomic dataset of features (e.g., +gene/exon coordinates) against many different genomic datasets of reads. If the object is being used many times (gene/exon coordinates, in this case), we can force that object to be broadcast once and reused many times with the ``broadcast()`` function. This pairs with the @@ -123,7 +123,7 @@ expand from. Filter Genotypes by Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This query joins an RDD of Genotypes against an RDD of Features using an +This query joins a genomic dataset of Genotypes against a genomic dataset of Features using an inner join. Because this is an inner join, records from either dataset that do not pair to the other are automatically dropped, providing the filter we are interested in. This query is useful for trying to identify @@ -160,12 +160,12 @@ smaller in size than genotypic data. Group overlapping variant data by the gene they overlap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This query joins an RDD of Variants against an RDD of Features, and -immediately performs a group-by on the Feature. This produces an RDD +This query joins a genomic dataset of Variants against a genomic dataset of Features, and +immediately performs a group-by on the Feature. This produces a genomic dataset whose elements are a tuple containing a Feature, and all of the -Variants overlapping the Feature. This produces an RDD whose elements -are tuples containing a Feature and all of the Variants overlapping the -Feature.This query is useful for trying to identify annotated variants +Variants overlapping the Feature. This query is useful for trying to identify annotated variants that may interact (identifying frameshift mutations within a transcript that may act as a pair to shift and then restore the reading frame) or as the start of a query that computes variant density over a set of @@ -198,7 +198,7 @@ optimize by combining the join and group-by. Separate reads into overlapping and non-overlapping features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This query joins an RDD of reads with an RDD of features using an outer +This query joins a genomic dataset of reads with a genomic dataset of features using an outer join. The outer join will produce an RDD where each read is optionally mapped to a feature. If a given read does not overlap with any features provided, it is paired with a ``None``. After we perform the join, we diff --git a/docs/api/overview.rst b/docs/api/overview.rst index 84314a2303..9106771d9a 100644 --- a/docs/api/overview.rst +++ b/docs/api/overview.rst @@ -3,11 +3,11 @@ API Overview The main entrypoint to ADAM is the `ADAMContext `__, which allows genomic data to be loaded into Spark as -`GenomicRDD `__. GenomicRDDs can be transformed using +`GenomicDataset `__. GenomicDatasets can be transformed using ADAM's built in `pre-processing algorithms <../algorithms/reads.html>`__, `Spark's -RDD primitives `__, the `region join `__ -primitive, and ADAM's `pipe `__ APIs. GenomicRDDs can also be -interacted with as `Spark SQL tables `__. +RDD primitives `__, the `region join `__ +primitive, and ADAM's `pipe `__ APIs. GenomicDatasets can also be +interacted with as `Spark SQL tables `__.
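To make the region join primitive mentioned above concrete, here is a minimal sketch of the "Filter Genotypes by Features" pattern described earlier (the input paths are hypothetical, and the join direction shown is just one reasonable choice):

.. code:: scala

    import org.bdgenomics.adam.rdd.ADAMContext._

    val genotypes = sc.loadGenotypes("path/to/genotypes.adam")
    val features = sc.loadFeatures("path/to/regions-of-interest.bed")

    // inner region join: genotypes that do not overlap any feature are dropped
    val joined = features.broadcastRegionJoin(genotypes)

    // the joined records are (Feature, Genotype) pairs; keep just the genotypes
    val filteredGenotypes = joined.rdd.map(_._2)

Because this is an inner join, the map over the underlying RDD simply discards the feature half of each pair to recover the filtered genotypes.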
In addition to the Scala/Java API, ADAM can be used from `Python <#the-adam-python-api>`__ and `R <#the-adam-r-api>`__. @@ -48,7 +48,7 @@ The ADAM Python API ------------------- ADAM's Python API wraps the `ADAMContext `__ and -`GenomicRDD `__ APIs so they can be used from PySpark. The +`GenomicDataset `__ APIs so they can be used from PySpark. The Python API is feature complete relative to ADAM's Java API. `Read more about the Python API. `__ @@ -57,5 +57,5 @@ The ADAM R API -------------- ADAM's R API wraps the `ADAMContext `__ and -`GenomicRDD `__ APIs so they can be used from SparkR. The +`GenomicDataset `__ APIs so they can be used from SparkR. The R API is feature complete relative to ADAM's Java API. diff --git a/docs/api/pipes.rst b/docs/api/pipes.rst index 2458f51ff5..1c18f04f35 100644 --- a/docs/api/pipes.rst +++ b/docs/api/pipes.rst @@ -1,7 +1,7 @@ Using ADAM's Pipe API --------------------- -ADAM's ``GenomicRDD`` API provides support for piping the underlying +ADAM's ``GenomicDataset`` API provides support for piping the underlying genomic data out to a single node process through the use of a ``pipe`` API. This builds off of Apache Spark's ``RDD.pipe`` API. However, ``RDD.pipe`` prints the objects as strings to the pipe. ADAM's pipe API @@ -22,7 +22,7 @@ The method signature of a pipe command is below: .. code:: scala - def pipe[X, Y <: GenomicRDD[X, Y], V <: InFormatter[T, U, V]](cmd: Seq[String], + def pipe[X, Y <: GenomicDataset[X, Y], V <: InFormatter[T, U, V]](cmd: Seq[String], files: Seq[String] = Seq.empty, environment: Map[String, String] = Map.empty, flankSize: Int = 0)(implicit tFormatterCompanion: InFormatterCompanion[T, U, V], @@ -32,8 +32,8 @@ The method signature of a pipe command is below: xManifest: ClassTag[X]): Y ``X`` is the type of the records that are returned (e.g., for reads, -``AlignmentRecord``) and ``Y`` is the type of the ``GenomicRDD`` that is -returned (e.g., for reads, ``AlignmentRecordRDD``). As explicit +``AlignmentRecord``) and ``Y`` is the type of the ``GenomicDataset`` that is +returned (e.g., for reads, ``AlignmentRecordDataset``). As explicit parameters, we take: - ``cmd``: The command to run. @@ -50,12 +50,12 @@ parameters, we take: Additionally, we take several important implicit parameters: - ``tFormatter``: The ``InFormatter`` that converts the data that is - piped into the run command from the underlying ``GenomicRDD`` type. + piped into the run command from the underlying ``GenomicDataset`` type. - ``xFormatter``: The ``OutFormatter`` that converts the data that is piped out of the run command back to objects for the output - ``GenomicRDD``. + ``GenomicDataset``. - ``convFn``: A function that applies any necessary metadata - conversions and creates a new ``GenomicRDD``. + conversions and creates a new ``GenomicDataset``. The ``tManifest`` and ``xManifest`` implicit parameters are `Scala ClassTag `__\ s @@ -67,45 +67,45 @@ reads can be saved to or read from BAM, CRAM, FASTQ, and SAM). The ``InFormatter`` and ``OutFormatter`` parameters specify the format that is being read into or out of the pipe. We support the following: -- ``AlignmentRecordRDD``: +- ``AlignmentRecordDataset``: - ``InFormatter``\ s: ``SAMInFormatter`` and ``BAMInFormatter`` write SAM or BAM out to a pipe. - ``OutFormatter``: ``AnySAMOutFormatter`` supports reading SAM and BAM from a pipe, with the exact format autodetected from the stream. - We do not support piping CRAM due to complexities around the reference-based compression.
-- ``FeatureRDD``: +- ``FeatureDataset``: - ``InFormatter``\ s: ``BEDInFormatter``, ``GFF3InFormatter``, ``GTFInFormatter``, and ``NarrowPeakInFormatter`` for writing features out to a pipe in BED, GFF3, GTF/GFF2, or NarrowPeak format, respectively. - ``OutFormatter``\ s: ``BEDOutFormatter``, ``GFF3OutFormatter``, ``GTFOutFormatter``, and ``NarrowPeakInFormatter`` for reading features in BED, GFF3, GTF/GFF2, or NarrowPeak format in from a pipe, respectively. -- ``FragmentRDD``: +- ``FragmentDataset``: - ``InFormatter``: ``InterleavedFASTQInFormatter`` writes FASTQ with the reads from a paired sequencing protocol interleaved in the FASTQ stream to a pipe. -- ``VariantContextRDD``: +- ``VariantContextDataset``: - ``InFormatter``: ``VCFInFormatter`` writes VCF to a pipe. - ``OutFormatter``: ``VCFOutFormatter`` reads VCF from a pipe. The ``convFn`` implementations are provided as implicit values in the `ADAMContext `__. These conversion functions are needed -to adapt the metadata stored in a single ``GenomicRDD`` to the type of a -different ``GenomicRDD`` (e.g., if piping an ``AlignmentRecordRDD`` -through a command that returns a ``VariantContextRDD``, we will need to -convert the ``AlignmentRecordRDD``\ s ``RecordGroupDictionary`` into an -array of ``Sample``\ s for the ``VariantContextRDD``). We provide four +to adapt the metadata stored in a single ``GenomicDataset`` to the type of a +different ``GenomicDataset`` (e.g., if piping an ``AlignmentRecordDataset`` +through a command that returns a ``VariantContextDataset``, we will need to +convert the ``AlignmentRecordDataset``\ s ``RecordGroupDictionary`` into an +array of ``Sample``\ s for the ``VariantContextDataset``). We provide four implementations: - ``ADAMContext.sameTypeConversionFn``: For piped commands that do not - change the type of the ``GenomicRDD`` (e.g., ``AlignmentRecordRDD`` → - ``AlignmentRecordRDD``). + change the type of the ``GenomicDataset`` (e.g., ``AlignmentRecordDataset`` → + ``AlignmentRecordDataset``). - ``ADAMContext.readsToVCConversionFn``: For piped commands that go - from an ``AlignmentRecordRDD`` to a ``VariantContextRDD``. + from an ``AlignmentRecordDataset`` to a ``VariantContextDataset``. - ``ADAMContext.fragmentsToReadsConversionFn``: For piped commands that - go from a ``FragmentRDD`` to an ``AlignmentRecordRDD``. + go from a ``FragmentDataset`` to an ``AlignmentRecordDataset``. To put everything together, here is an example command. Here, we will run a command ``my_variant_caller``, which accepts one argument @@ -114,7 +114,7 @@ standard output: .. 
code:: scala - // import RDD load functions and conversion functions + // import genomic dataset load functions and conversion functions import org.bdgenomics.adam.rdd.ADAMContext._ // import functionality for piping SAM into pipe @@ -123,7 +123,7 @@ standard output: // import functionality for reading VCF from pipe import org.bdgenomics.adam.converters.DefaultHeaderLines import org.bdgenomics.adam.rdd.variant.{ - VariantContextRDD, + VariantContextDataset, VCFOutFormatter } @@ -138,9 +138,9 @@ standard output: implicit val uFormatter = new VCFOutFormatter(DefaultHeaderLines.allHeaderLines) // run the piped command - // providing the explicit return type (VariantContextRDD) will ensure that + // providing the explicit return type (VariantContextDataset) will ensure that // the correct implicit convFn is selected - val variantContexts: VariantContextRDD = reads.pipe( + val variantContexts: VariantContextDataset = reads.pipe( cmd = Seq("my_variant_caller", "-R", "$0"), files = Seq("hdfs://mynamenode/my/reference/genome.fa")) @@ -185,15 +185,15 @@ To run the Scala example code above using Java, we would write: import java.util.List; import java.util.Map; import org.bdgenomics.adam.models.VariantContext - import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD; + import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset; import org.bdgenomics.adam.rdd.read.SAMInFormatter; - import org.bdgenomics.adam.rdd.variant.VariantContextRDD; + import org.bdgenomics.adam.rdd.variant.VariantContextDataset; import org.bdgenomics.adam.rdd.variant.VCFOutFormatter; import org.bdgenomics.adam.api.java.AlignmentRecordToVariantContextConverter; class PipeRunner { - VariantContextRDD runPipe(AlignmentRecordRDD reads) { + VariantContextDataset runPipe(AlignmentRecordDataset reads) { List cmd = new ArrayList(); cmd.add("my_variant_caller"); @@ -206,7 +206,7 @@ To run the Scala example code above using Java, we would write: Map env = new HashMap(); return reads.pipe(cmd, files, env, diff --git a/docs/architecture/evidence.rst b/docs/architecture/evidence.rst index 131e639706..b86fdba9ff 100644 --- a/docs/architecture/evidence.rst +++ b/docs/architecture/evidence.rst @@ -6,24 +6,24 @@ ADAM exposes access to distributed datasets of genomic data through the Spark's SparkContext, which tracks the configuration and state of the current running Spark application. On top of the SparkContext, the ADAMContext provides data loading functions which yield -`GenomicRDD <../api/genomicRdd.html>`__\ s. The GenomicRDD classes provide a +`GenomicDataset <../api/genomicDataset.html>`__\ s. The GenomicDataset classes provide a wrapper around Apache Spark's two APIs for manipulating distributed datasets: the legacy Resilient Distributed Dataset (Zaharia et al. 2012) and the new Spark SQL Dataset/DataFrame API (Armbrust et al. 2015). -Additionally, the GenomicRDD is enriched with genomics-specific metadata +Additionally, the GenomicDataset is enriched with genomics-specific metadata such as computational lineage and sample metadata, and optimized genomics-specific query patterns such as `region joins <../api/joins.html>`__ and the `auto-parallelizing pipe API <../api/pipes.html>`__ for running legacy tools using Apache Spark. .. 
figure:: img/grdd.png - :alt: The GenomicRDD Class Hierarchy + :alt: The GenomicDataset Class Hierarchy - The GenomicRDD Class Hierarchy + The GenomicDataset Class Hierarchy -All GenomicRDDs include a sequence dictionary which describes the -reference genome that the data in the RDD are aligned to, if one is -known. Additionally, RecordGroupGenomicRDD store a dictionary with read +All GenomicDatasets include a sequence dictionary which describes the +reference genome that the data in the genomic dataset are aligned to, if one is +known. Additionally, RecordGroupGenomicDatasets store a dictionary with read groups that are attached to the reads/fragments. Similarly, the -MultisampleGenomicRDD includes a list of samples who are present in the +MultisampleGenomicDataset includes a list of samples that are present in the dataset. diff --git a/docs/architecture/overview.rst b/docs/architecture/overview.rst index 848ff530ed..0e49934c7e 100644 --- a/docs/architecture/overview.rst +++ b/docs/architecture/overview.rst @@ -53,8 +53,8 @@ architectures impose significant restrictions, including: query optimizations. At the core of ADAM, users use the `ADAMContext <../api/adamContext.html>`__ to -load data as `GenomicRDDs <../api/genomicRdd.html>`__, which they can then -manipulate. In the GenomicRDD class hierarchy, we provide several +load data as `GenomicDatasets <../api/genomicDataset.html>`__, which they can then +manipulate. In the GenomicDataset class hierarchy, we provide several classes that contain functionality that is applicable to all genomic datatypes, such as `coordinate-space joins <../api/joins.html>`__, the `pipe <../api/pipes.html>`__ API, and genomic metadata management. diff --git a/docs/architecture/stackModel.rst b/docs/architecture/stackModel.rst index 567d5cf377..e87d7c50b7 100644 --- a/docs/architecture/stackModel.rst +++ b/docs/architecture/stackModel.rst @@ -46,10 +46,10 @@ these are: Spark SQL (Armbrust et al. 2015) for evidence access and query. 6. The *presentation* layer provides high level abstractions for interacting with a parallel collection of genomic data. In ADAM, we - implement this layer through the `GenomicRDD <../api/genomicRdd.html>`__ + implement this layer through the `GenomicDataset <../api/genomicDataset.html>`__ classes. This layer presents users with a view of the metadata associated with a collection of genomic data, and APIs for - `transforming <../api/genomicRdd.html#transforming-genomicrdds>`__ and + `transforming <../api/genomicDataset.html#transforming-genomicdatasets>`__ and `joining <../api/joins.html>`__ genomic data. Additionally, this is the layer where we provide cross-language support. 7. The *application* layer is the layer where a user writes their diff --git a/docs/index.rst b/docs/index.rst index 93ff88d41d..6bcc3dea83 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -99,7 +99,7 @@ For more, please see our `awesome list of applications