From b72af8775a546bb252c67ff064d20fa9b033074b Mon Sep 17 00:00:00 2001 From: Cotton Seed Date: Sun, 15 Nov 2015 16:38:43 -0500 Subject: [PATCH 01/15] Improved filter tests. add VariantInfo to VSM and make things compile Delete TupleVSM Delete RecordReader add VariantInfo to VSM and make things compile Delete TupleVSM Delete RecordReader Map[String, String] VariantInfo stores Map[String, String] Added contigLength map to LoadVCF -- was a TODO item Code compiling Annotation filters functional from within FilterVariant Commit to be squashed Annotations updates placeholder Create AnnotationSignature abstract class and SimpleSignature implementation for most info field value -- tested, working Create AnnotationSignature abstract class and SimpleSignature implementation for most info field value -- tested, working Added ExportVariants and ExportSamples commands Added ExportVariants and ExportSamples commands Added ExportVariants and ExportSamples commands --- .../scala/org/broadinstitute/hail/Utils.scala | 1 - .../annotations/AnnotationSignature.scala | 7 + .../hail/annotations/Annotations.scala | 167 ++++++++++++++++++ .../hail/annotations/SimpleSignature.scala | 11 ++ .../hail/annotations/StupidAnnotation.scala | 7 + .../hail/annotations/package.scala | 6 + .../hail/driver/ExportSamples.scala | 68 +++++++ .../hail/driver/ExportVariants.scala | 68 +++++++ .../hail/driver/FilterGenotypes.scala | 28 +-- .../hail/driver/FilterSamples.scala | 10 +- .../hail/driver/FilterVariants.scala | 11 +- .../broadinstitute/hail/driver/SampleQC.scala | 135 +++++++++++--- .../hail/driver/VariantQC.scala | 129 +++++++++++--- .../hail/methods/ExportTSV.scala | 49 +++++ .../broadinstitute/hail/methods/Filter.scala | 101 +++++++++-- .../hail/methods/LinearRegression.scala | 2 +- .../broadinstitute/hail/methods/LoadVCF.scala | 89 +++++++++- .../hail/variant/Genotype.scala | 9 +- .../broadinstitute/hail/variant/RichRow.scala | 7 + .../hail/variant/VariantMetadata.scala | 26 ++- 
.../hail/variant/VariantSampleMatrix.scala | 140 +++++++++++---- .../hail/vcf/HtsjdkRecordReader.scala | 59 +++++-- .../hail/annotations/AnnotationsSuite.scala | 79 +++++++++ .../hail/utils/TestRDDBuilder.scala | 5 +- .../hail/variant/vsm/VSMSuite.scala | 2 +- 25 files changed, 1070 insertions(+), 146 deletions(-) create mode 100644 src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala create mode 100644 src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala create mode 100644 src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala create mode 100644 src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala create mode 100644 src/main/scala/org/broadinstitute/hail/annotations/package.scala create mode 100644 src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala create mode 100644 src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala create mode 100644 src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala create mode 100644 src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala diff --git a/src/main/scala/org/broadinstitute/hail/Utils.scala b/src/main/scala/org/broadinstitute/hail/Utils.scala index c68145ec647..07bc4dbf316 100644 --- a/src/main/scala/org/broadinstitute/hail/Utils.scala +++ b/src/main/scala/org/broadinstitute/hail/Utils.scala @@ -489,7 +489,6 @@ object Utils { def D_>=(a: Double, b: Double, tolerance: Double = 1.0E-6): Boolean = a - b >= -D_epsilon(a, b, tolerance) - def flushDouble(a: Double): Double = if (math.abs(a) < java.lang.Double.MIN_NORMAL) 0.0 else a diff --git a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala new file mode 100644 index 00000000000..8754baa86fe --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala @@ -0,0 +1,7 @@ +package 
org.broadinstitute.hail.annotations + +abstract class AnnotationSignature { + def buildCaseClasses: String + def conversion: String + def getType: String +} diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala new file mode 100644 index 00000000000..a12072d2788 --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -0,0 +1,167 @@ +package org.broadinstitute.hail.annotations + +case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T]) extends Serializable { + + def nAttrs: Int = { + var i = 0 + maps.foreach { + case (id, m) => + i += m.size + } + i += vals.size + i + } + + def hasMap(str: String): Boolean = maps.contains(str) + + def contains(str: String): Boolean = vals.contains(str.toLowerCase) + + def contains(parent: String, str: String): Boolean = hasMap(parent) && maps(parent).contains(str) + + def get(str: String): Option[T] = vals.get(str) + + def get(parent: String, str: String): Option[T] = { + if (!hasMap(parent)) + None + else + maps(parent).get(str) + } + + def getOrElse(parent: String, str: String, default: T): T = { + if (!hasMap(parent) || !contains(parent, str)) + default + else + maps(parent)(str) + } + + def getOrElse(str: String, default: T): T = { + if (!contains(str)) + default + else + vals(str) + } + + def addMap(name: String, m: Map[String, T]): Annotations[T] = { + Annotations(maps + .-(name) + .+((name, m)), vals) + } + + def addMaps(newMaps: Map[String, Map[String, T]]): Annotations[T] = { + Annotations(maps + .--(newMaps.keys) + .++(newMaps), vals) + } + + def addVal(name: String, mapping: T): Annotations[T] = { + Annotations(maps, vals + .-(name) + .+((name, mapping))) + } + + def addVals(newVals: Map[String, T]): Annotations[T] = { + Annotations(maps, vals + .--(newVals.keys) + .++(newVals)) + } + + def equals(other: Annotations[T]): Boolean = { + vals.forall { case (k, v) => 
other.vals.contains(k) && other.vals(k) == v} && + maps.forall { case (mName, m) => other.maps.contains(mName) && m.forall { + case (k, v) => other.maps(mName).contains(k) && other.maps(mName)(k) == v }} + } +} + +object EmptyAnnotationSignatures { + def apply(): AnnotationSignatures = { + Annotations(Map.empty[String, Map[String, AnnotationSignature]], Map.empty[String, AnnotationSignature]) + } +} + +object EmptyAnnotations { + def apply(): AnnotationData = { + Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String]) + } +} + +object EmptySampleAnnotations { + def apply(nSamples: Int): Array[AnnotationData] = { + (0 until nSamples) + .map(i => Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String])) + .toArray + } +} + +object AnnotationUtils { + + def annotationToString(ar: AnyRef): String = { + ar match { + case iter: Iterable[_] => iter.map(_.toString).reduceRight(_ + ", " + _) + case _ => ar.toString + } + } + + def parseAnnotationType(str: String): String = { + str match { + case "Flag" => "Boolean" + case "Integer" => "Int" + case "Float" => "Double" + case "String" => "String" + case _ => throw new UnsupportedOperationException("unexpected annotation type") + } + } +} + +object AnnotationClassBuilder { + + def signatures(sigs: AnnotationSignatures, hiddenClassName: String): String = { + val internalClasses = sigs.maps.map { + case (subclass, subMap) => + s"class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable {\n" + + subMap.map { case (k, sig) => +// s""" val $k: $kType = subMap.getOrElse("$k", \"false\").$kMethod\n""" + val default = getDefault(sig.getType) + s""" val $k: ${sig.getType} = subMap.getOrElse("$k", "$default").${sig.conversion}\n""" + } + .foldRight[String]("")(_ + _) + "}\n" + } + .foldRight[String]("")(_ + _) + + val hiddenClass = s"class ${hiddenClassName}Annotations" + + s"(annot: org.broadinstitute.hail.annotations.AnnotationData) extends Serializable {\n" + + 
sigs.maps.map { case (subclass, subMap) => + s""" val $subclass = new __${subclass}Annotations(annot.maps(\"$subclass\"))\n""" } + .foldRight[String]("")(_ + _) + + sigs.vals.map { case (k, sig) => + val default = getDefault(sig.getType) + s""" val $k: ${sig.getType} = annot.vals.getOrElse("$k", "$default").${sig.conversion} \n""" + } + .foldRight[String]("")(_ + _) + "}\n" + + "\n" + internalClasses + hiddenClass + } + + def instantiate(exposedName: String, hiddenClassName: String): String = { + s"val $exposedName = new ${hiddenClassName}Annotations($hiddenClassName)\n" + } + + def makeArray(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = { + s"val $hiddenOutputName: Array[${hiddenClassName}Annotations] = " + + "$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_))\n" + } + + val arrayRegex = """Array\[(\w+)\]""".r + val optionRegex = """Option\[(\w+)\]""".r + private def getDefault(typeStr: String): String = { + if (typeStr == "Int" || typeStr == "Double") + "0" + else if (typeStr == "Boolean") + "false" + else + typeStr match { + case optionRegex(subType) => "None" + case arrayRegex(subType) => getDefault(subType) + case _ => "" + } + } +} \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala new file mode 100644 index 00000000000..2037358c290 --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala @@ -0,0 +1,11 @@ +package org.broadinstitute.hail.annotations + +class SimpleSignature(scalaType: String, conversionMethod: String, description: String) + extends AnnotationSignature { + + def buildCaseClasses: String = "" + + def conversion: String = conversionMethod + + def getType: String = scalaType +} diff --git a/src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala 
b/src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala new file mode 100644 index 00000000000..e62413e854f --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala @@ -0,0 +1,7 @@ +package org.broadinstitute.hail.annotations + +class StupidAnnotation() extends AnnotationSignature { + def buildCaseClasses: String = throw new UnsupportedOperationException + def conversion: String = throw new UnsupportedOperationException + def getType: String = throw new UnsupportedOperationException +} diff --git a/src/main/scala/org/broadinstitute/hail/annotations/package.scala b/src/main/scala/org/broadinstitute/hail/annotations/package.scala new file mode 100644 index 00000000000..e91f1a8313d --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/annotations/package.scala @@ -0,0 +1,6 @@ +package org.broadinstitute.hail + +package object annotations { + type AnnotationSignatures = Annotations[AnnotationSignature] + type AnnotationData = Annotations[String] +} diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala new file mode 100644 index 00000000000..f0c3bc7b6fd --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala @@ -0,0 +1,68 @@ +package org.broadinstitute.hail.driver + +import org.broadinstitute.hail.Utils._ +import org.broadinstitute.hail.methods._ +import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ +import org.kohsuke.args4j.{Option => Args4jOption} + +object ExportSamples extends Command { + + class Options extends BaseOptions { + + @Args4jOption(required = true, name = "-o", aliases = Array("--output"), + usage = "path of output tsv") + var output: String = _ + + @Args4jOption(required = true, name = "-c", aliases = Array("--condition"), + usage = "Comma-separated list of fields to be printed to tsv") + var condition: String = _ + + 
@Args4jOption(required = false, name = "--missing", + usage = "Format of missing values (Default: 'NA')") + var missing = "NA" + } + + def newOptions = new Options + + def name = "exportsamples" + + def description = "Export list of sample information to tsv" + + def run(state: State, options: Options): State = { + val vds = state.vds + + val cond = options.condition + + val output = options.output + + val sas = vds.metadata.sampleAnnotationSignatures + val makeString: (Sample, Annotations[String]) => String = { + try { + val ese = new ExportSamplesEvaluator(cond, sas, options.missing) + ese.typeCheck() + ese.apply + } catch { + case e: scala.tools.reflect.ToolBoxError => + /* e.message looks like: + reflective compilation has failed: + + ';' expected but '.' found. */ + fatal("parse error in condition: " + e.message.split("\n").last) + } + } + + writeTextFile(output + ".header", state.hadoopConf) { s => + s.write(cond.split(",").reduceRight(_ + "\t" + _)) + s.write("\n") + } + + hadoopDelete(output, state.hadoopConf, true) + + vds.sparkContext.parallelize(vds.sampleIds.map(Sample).zip(vds.metadata.sampleAnnotations)) + .map { case (s, sa) => makeString(s, sa)} + .saveAsTextFile(output) + + state + } +} diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala new file mode 100644 index 00000000000..3f0a9768bb8 --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala @@ -0,0 +1,68 @@ +package org.broadinstitute.hail.driver + +import org.broadinstitute.hail.Utils._ +import org.broadinstitute.hail.methods._ +import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ +import org.kohsuke.args4j.{Option => Args4jOption} + +object ExportVariants extends Command { + + class Options extends BaseOptions { + + @Args4jOption(required = true, name = "-o", aliases = Array("--output"), + usage = "path of output tsv") + var output: 
String = _ + + @Args4jOption(required = true, name = "-c", aliases = Array("--condition"), + usage = "Comma-separated list of fields to be printed to tsv") + var condition: String = _ + + @Args4jOption(required = false, name = "--missing", + usage = "Format of missing values (Default: 'NA')") + var missing = "NA" + } + + def newOptions = new Options + + def name = "exportvariants" + + def description = "Export list of variant information to tsv" + + def run(state: State, options: Options): State = { + val vds = state.vds + + val cond = options.condition + + val output = options.output + + val vas = vds.metadata.variantAnnotationSignatures + val makeString: (Variant, Annotations[String]) => String = { + try { + val eve = new ExportVariantsEvaluator(cond, vas, options.missing) + eve.typeCheck() + eve.apply + } catch { + case e: scala.tools.reflect.ToolBoxError => + /* e.message looks like: + reflective compilation has failed: + + ';' expected but '.' found. */ + fatal("parse error in condition: " + e.message.split("\n").last) + } + } + + writeTextFile(output + ".header", state.hadoopConf) { s => + s.write(cond.split(",").reduceRight(_ + "\t" + _)) + s.write("\n") + } + + hadoopDelete(output, state.hadoopConf, true) + + vds.variantsAndAnnotations + .map { case (v, va) => makeString(v, va) } + .saveAsTextFile(output) + + state + } +} diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala index 72034cc72f7..69c3c8bd628 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala @@ -2,6 +2,7 @@ package org.broadinstitute.hail.driver import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.methods._ +import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.variant.{Variant, Genotype, Sample} import org.kohsuke.args4j.{Option => Args4jOption} @@ -27,18 
+28,19 @@ object FilterGenotypes extends Command { def run(state: State, options: Options): State = { val vds = state.vds + val vas: AnnotationSignatures = state.vds.metadata.variantAnnotationSignatures + val sas: AnnotationSignatures = state.vds.metadata.sampleAnnotationSignatures + val sa = state.vds.metadata.sampleAnnotations if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") - val p: (Variant, Sample, Genotype) => Boolean = try { - val cf = new FilterGenotypeCondition(options.condition) + val p: Array[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean)) = try { + val cf = new FilterGenotypeCondition(options.condition, vas, sas, sa) cf.typeCheck() - if (options.keep) - cf.apply - else - (v: Variant, s: Sample, g: Genotype) => !cf(v, s, g) - } catch { + cf.apply + } + catch { case e: scala.tools.reflect.ToolBoxError => /* e.message looks like: reflective compilation has failed: @@ -49,12 +51,14 @@ object FilterGenotypes extends Command { val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) - val newVDS = vds.mapValuesWithKeys((v: Variant, s: Int, g: Genotype) => - if (p(v, Sample(sampleIdsBc.value(s)), g)) + //FIXME put keep/remove logic here + val newVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => + if (p(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g)) { g - else - Genotype(-1, (0, 0), 0, null)) - + } + else { + Genotype(-1, (0, 0), 0, null) + }) state.copy(vds = newVDS) } } diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala index a8ab13744e0..b64f1098ed9 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala @@ -3,6 +3,7 @@ package org.broadinstitute.hail.driver import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.methods._ import 
org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ import org.kohsuke.args4j.{Option => Args4jOption} import scala.io.Source @@ -29,6 +30,7 @@ object FilterSamples extends Command { def run(state: State, options: Options): State = { val vds = state.vds + val sas: AnnotationSignatures = state.vds.metadata.sampleAnnotationSignatures if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") @@ -42,14 +44,14 @@ object FilterSamples extends Command { .filter(line => !line.isEmpty) .map(indexOfSample) .toSet - samples.contains(_) + (s: Int, sa: AnnotationData) => samples.contains(s) case c: String => try { - val cf = new FilterSampleCondition(c) + val cf = new FilterSampleCondition(c, sas) cf.typeCheck() val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) - (s: Int) => cf(Sample(sampleIdsBc.value(s))) + (s: Int, sa: AnnotationData) => cf(Sample(sampleIdsBc.value(s)), state.vds.metadata.sampleAnnotations(s)) } catch { case e: scala.tools.reflect.ToolBoxError => /* e.message looks like: @@ -63,7 +65,7 @@ object FilterSamples extends Command { val newVDS = vds.filterSamples(if (options.keep) p else - (s: Int) => !p(s)) + (s: Int, sa: AnnotationData) => !p(s, sa)) state.copy(vds = newVDS) } diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala index f05422a047f..407032d583b 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala @@ -3,6 +3,7 @@ package org.broadinstitute.hail.driver import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.methods._ import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ import org.kohsuke.args4j.{Option => Args4jOption} object FilterVariants extends Command { @@ -32,13 +33,14 @@ object FilterVariants extends Command { fatal(name + ": one 
of `--keep' or `--remove' required") val cond = options.condition - val p: (Variant) => Boolean = cond match { + val vas = vds.metadata.variantAnnotationSignatures + val p: (Variant, Annotations[String]) => Boolean = cond match { case f if f.endsWith(".interval_list") => val ilist = IntervalList.read(options.condition) - (v: Variant) => ilist.contains(v.contig, v.start) + (v: Variant, va: Annotations[String]) => ilist.contains(v.contig, v.start) case c: String => try { - val cf = new FilterVariantCondition(c) + val cf = new FilterVariantCondition(c, vas) cf.typeCheck() cf.apply } catch { @@ -54,8 +56,7 @@ object FilterVariants extends Command { val newVDS = vds.filterVariants(if (options.keep) p else - (v: Variant) => !p(v)) - + (v: Variant, va: Annotations[String]) => !p(v, va)) state.copy(vds = newVDS) } } diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index faf1c48b789..e02ba694b19 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -1,9 +1,9 @@ package org.broadinstitute.hail.driver -import org.apache.commons.math3.distribution.BinomialDistribution import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.util.StatCounter +import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.methods._ import org.broadinstitute.hail.variant._ import org.broadinstitute.hail.Utils._ @@ -35,6 +35,38 @@ object SampleQCCombiner { "rTiTv\t" + "rHetHomVar\t" + "rDeletionInsertion" + + val signatures = Map("nCalled" -> new SimpleSignature("Int", "toInt", ""), + "nNotCalled" -> new SimpleSignature("Int", "toInt", ""), + "nHomRef" -> new SimpleSignature("Int", "toInt", ""), + "nHet" -> new SimpleSignature("Int", "toInt", ""), + "nHomVar" -> new SimpleSignature("Int", "toInt", ""), + "nSNP" -> new SimpleSignature("Int", "toInt", ""), + 
"nInsertion" -> new SimpleSignature("Int", "toInt", ""), + "nDeletion" -> new SimpleSignature("Int", "toInt", ""), + "nSingleton" -> new SimpleSignature("Int", "toInt", ""), + "nTransition" -> new SimpleSignature("Int", "toInt", ""), + "nTransversion" -> new SimpleSignature("Int", "toInt", ""), + "dpMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "nNonRef" -> new SimpleSignature("Int", "toInt", ""), + "rTiTv" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "rHetHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "rDeletionInsertion" -> new SimpleSignature("Option[Double]", "toOptionDouble", "")) } class SampleQCCombiner extends Serializable { @@ -220,13 +252,55 @@ class SampleQCCombiner extends Serializable { // rDeletionInsertion 
sb.tsvAppend(divOption(nDel, nIns)) } + + def asMap: Map[String, String] = { + Map("nCalled" -> (nHomRef + nHet + nHomVar).toString, + "nNotCalled" -> nNotCalled.toString, + "nHomRef" -> nHomRef.toString, + "nHet" -> nHet.toString, + "nHomVar" -> nHomVar.toString, + "nSNP" -> nSNP.toString, + "nInsertion" -> nIns.toString, + "nDeletion" -> nDel.toString, + "nSingleton" -> nSingleton.toString, + "nTransition" -> nTi.toString, + "nTransversion" -> nTv.toString, + "dpMean" -> someIf(dpSC.count > 0, dpSC.mean).toString, + "dpStDev" -> someIf(dpSC.count > 0, dpSC.stdev).toString, + "dpMeanHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.mean).toString, + "dpStDevHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.stdev).toString, + "dpMeanHet" -> someIf(dpHetSC.count > 0, dpHetSC.mean).toString, + "dpStDevHet" -> someIf(dpHetSC.count > 0, dpHetSC.stdev).toString, + "dpMeanHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.mean).toString, + "dpStDevHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.stdev).toString, + "gqMean" -> someIf(gqSC.count > 0, gqSC.mean).toString, + "gqStDev" -> someIf(gqSC.count > 0, gqSC.stdev).toString, + "gqMeanHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.mean).toString, + "gqStDevHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.stdev).toString, + "gqMeanHet" -> someIf(gqHetSC.count > 0, gqHetSC.mean).toString, + "gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev).toString, + "gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean).toString, + "gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev).toString, + "nNonRef" -> (nHet + nHomVar).toString, + "rTiTv" -> divOption(nTi, nTv).toString, + "rHetHomVar" -> divOption(nHet, nHomVar).toString, + "rDeletionInsertion" -> divOption(nDel, nIns).toString) + } + } object SampleQC extends Command { class Options extends BaseOptions { - @Args4jOption(required = true, name = "-o", aliases = Array("--output"), usage = "Output file") - var output: String = _ + + 
@Args4jOption(required = false, name = "-o", aliases = Array("--output"), + usage = "Output file", forbids = Array("store")) + var output: String = "" + + @Args4jOption(required = false, name = "-s", aliases = Array("--store"), + usage = "Store qc output in vds annotations", forbids = Array("output")) + var store: Boolean = false + } def newOptions = new Options @@ -249,25 +323,42 @@ object SampleQC extends Command { val output = options.output - writeTextFile(output + ".header", state.hadoopConf) { s => - s.write("sampleID\t") - s.write(SampleQCCombiner.header) - s.write("\n") + if (options.store) { + val singletons = sSingletonVariants(vds) + val sampleIdsBc = state.sc.broadcast(vds.sampleIds) + val r = results(vds, singletons).collectAsMap() + val newAnnotations = vds.metadata.sampleAnnotations + .zipWithIndex + .map{ case (sa, s) => sa.addMap("qc", r(s).asMap) } + state.copy( + vds = vds.copy( + metadata=vds.metadata.copy( + sampleAnnotations = newAnnotations, + sampleAnnotationSignatures = vds.metadata.sampleAnnotationSignatures + .addMap("qc", SampleQCCombiner.signatures)))) + } + else { + + writeTextFile(output + ".header", state.hadoopConf) { s => + s.write("sampleID\t") + s.write(SampleQCCombiner.header) + s.write("\n") + } + + val singletons = sSingletonVariants(vds) + val sampleIdsBc = state.sc.broadcast(vds.sampleIds) + + hadoopDelete(output, state.hadoopConf, true) + val r = results(vds, singletons) + .map { case (s, comb) => + val sb = new StringBuilder() + sb.append(sampleIdsBc.value(s)) + sb += '\t' + comb.emit(sb) + sb.result() + }.saveAsTextFile(output) + + state } - - val singletons = sSingletonVariants(vds) - val sampleIdsBc = state.sc.broadcast(vds.sampleIds) - - hadoopDelete(output, state.hadoopConf, true) - val r = results(vds, singletons) - .map { case (s, comb) => - val sb = new StringBuilder() - sb.append(sampleIdsBc.value(s)) - sb += '\t' - comb.emit(sb) - sb.result() - }.saveAsTextFile(output) - - state } } diff --git 
a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala index 98af131f3bc..a6c80de11cd 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala @@ -4,6 +4,7 @@ import org.apache.commons.math3.distribution.BinomialDistribution import org.apache.spark.rdd.RDD import org.apache.spark.util.StatCounter import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.stats.LeveneHaldane import org.kohsuke.args4j.{Option => Args4jOption} @@ -29,6 +30,34 @@ object VariantQCCombiner { "rHeterozygosity\t" + "rHetHomVar\t" + "rExpectedHetFrequency\tpHWE\t" + + val signatures = Map("nCalled" -> new SimpleSignature("Int", "toInt", ""), + "nNotCalled" -> new SimpleSignature("Int", "toInt", ""), + "nHomRef" -> new SimpleSignature("Int", "toInt", ""), + "nHet" -> new SimpleSignature("Int", "toInt", ""), + "nHomVar" -> new SimpleSignature("Int", "toInt", ""), + "dpMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomRef" -> new 
SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "MAF" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "nNonRef" -> new SimpleSignature("Int", "toInt", ""), + "rHeterozygosity" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "rHetHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "rExpectedHetFrequency" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), + "pHWE" -> new SimpleSignature("Double", "toDouble", "")) } class VariantQCCombiner extends Serializable { @@ -171,13 +200,55 @@ class VariantQCCombiner extends Serializable { sb.tsvAppend(hwe._1) sb.append(hwe._2) } + + def asMap: Map[String, String] = { + val maf = { + val refAlleles = nHomRef * 2 + nHet + val altAlleles = nHomVar * 2 + nHet + divOption(altAlleles, refAlleles + altAlleles)} + + val hwe = HWEStats + + Map("nCalled" -> (nHomRef + nHet + nHomVar).toString, + "nNotCalled" -> nNotCalled.toString, + "nHomRef" -> nHomRef.toString, + "nHet" -> nHet.toString, + "nHomVar" -> nHomVar.toString, + "dpMean" -> someIf(dpSC.count > 0, dpSC.mean).toString, + "dpStDev" -> someIf(dpSC.count > 0, dpSC.stdev).toString, + "dpMeanHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.mean).toString, + "dpStDevHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.stdev).toString, + "dpMeanHet" -> someIf(dpHetSC.count > 0, dpHetSC.mean).toString, + "dpStDevHet" -> someIf(dpHetSC.count > 0, dpHetSC.stdev).toString, + "dpMeanHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.mean).toString, + "dpStDevHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.stdev).toString, + "gqMean" -> someIf(gqSC.count > 0, gqSC.mean).toString, + "gqStDev" 
-> someIf(gqSC.count > 0, gqSC.stdev).toString, + "gqMeanHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.mean).toString, + "gqStDevHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.stdev).toString, + "gqMeanHet" -> someIf(gqHetSC.count > 0, gqHetSC.mean).toString, + "gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev).toString, + "gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean).toString, + "gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev).toString, + "MAF" -> maf.toString, + "nNonRef" -> (nHet + nHomVar).toString, + "rHeterozygosity" -> divOption(nHet, nHomRef + nHet + nHomVar).toString, + "rHetHomVar" -> divOption(nHet, nHomVar).toString, + "rExpectedHetFrequency" -> hwe._1.toString, + "pHWE" -> hwe._2.toString) + } } object VariantQC extends Command { class Options extends BaseOptions { - @Args4jOption(required = true, name = "-o", aliases = Array("--output"), usage = "Output file") - var output: String = _ + @Args4jOption(required = false, name = "-o", aliases = Array("--output"), + usage = "Output file", forbids = Array("store")) + var output: String = "" + + @Args4jOption(required = false, name = "-s", aliases = Array("--store"), + usage = "Store qc output in vds annotations", forbids = Array("output")) + var store: Boolean = false } def newOptions = new Options @@ -191,33 +262,43 @@ object VariantQC extends Command { .aggregateByVariant(new VariantQCCombiner)((comb, g) => comb.merge(g), (comb1, comb2) => comb1.merge(comb2)) + def run(state: State, options: Options): State = { val vds = state.vds val output = options.output - writeTextFile(output + ".header", state.hadoopConf) { s => - s.write("Chrom\tPos\tRef\tAlt\t") - s.write(VariantQCCombiner.header) - s.write("\n") + if (options.store) + state.copy(vds = vds.mapAnnotationsWithAggregate(new VariantQCCombiner)((comb, v, s, g) => comb.merge(g), + (comb1, comb2) => comb1.merge(comb2), + (ad: AnnotationData, comb: VariantQCCombiner) => ad.addMap("qc", comb.asMap)) + 
.addVariantSignatures(Map("qc" -> VariantQCCombiner.signatures))) + else { + writeTextFile(output + ".header", state.hadoopConf) { s => + s.write("Chrom\tPos\tRef\tAlt\t") + s.write(VariantQCCombiner.header) + s.write("\n") + } + + val qcResults = results(vds) + + hadoopDelete(output, state.hadoopConf, true) + val r = results(vds) + .map { case (v, comb) => + val sb = new StringBuilder() + sb.append(v.contig) + sb += '\t' + sb.append(v.start) + sb += '\t' + sb.append(v.ref) + sb += '\t' + sb.append(v.alt) + sb += '\t' + comb.emit(sb) + sb.result() + }.saveAsTextFile(output) + + state } - - hadoopDelete(output, state.hadoopConf, true) - val r = results(vds) - .map { case (v, comb) => - val sb = new StringBuilder() - sb.append(v.contig) - sb += '\t' - sb.append(v.start) - sb += '\t' - sb.append(v.ref) - sb += '\t' - sb.append(v.alt) - sb += '\t' - comb.emit(sb) - sb.result() - }.saveAsTextFile(output) - - state } } diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala new file mode 100644 index 00000000000..a85f54a36a6 --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -0,0 +1,49 @@ +package org.broadinstitute.hail.methods + +import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ +import org.broadinstitute.hail.annotations._ +import org.broadinstitute.hail.variant.{Sample, Variant} +import scala.language.implicitConversions + +object Formatter { + def writeOption(o: Option[Any], missingValue: String): String = o match { + case Some(x) => x.toString + case None => missingValue + } +} + +class Formatter[T](val t: T) extends AnyVal { + def flattenOptions(missingValue: String): String = t match { + case x: Option[Any] => Formatter.writeOption(x, missingValue) + case _ => t.toString + } +} + +object ExportUtils { + implicit def toFormatter[T](t: T): Formatter[T] = new Formatter(t) +} + + +class ExportVariantsEvaluator(list: String, vas: 
AnnotationSignatures, missingValue: String) + extends Evaluator[(Variant, AnnotationData) => String]({ + "(v: org.broadinstitute.hail.variant.Variant, \n" + + "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + + "import org.broadinstitute.hail.methods.FilterUtils._; \n" + + "import org.broadinstitute.hail.methods.ExportUtils._; \n" + + signatures(vas, "__va") + + instantiate("va", "__va") + + s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String"""}) { + def apply(v: Variant, va: AnnotationData): String = eval()(v, va) +} + +class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures, missingValue: String) + extends Evaluator[(Sample, AnnotationData) => String]({ + "(s: org.broadinstitute.hail.variant.Sample, \n" + + "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + + "import org.broadinstitute.hail.methods.FilterUtils._; \n" + + "import org.broadinstitute.hail.methods.ExportUtils._; \n" + + signatures(sas, "__sa") + + instantiate("sa", "__sa") + + s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String"""}) { + def apply(s: Sample, sa: AnnotationData): String = eval()(s, sa) +} \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 0a8830b1438..2278d8b4edb 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -1,7 +1,11 @@ package org.broadinstitute.hail.methods +import org.apache.spark.SparkContext import org.broadinstitute.hail.Utils -import org.broadinstitute.hail.variant.{Sample, Genotype, Variant} +import org.broadinstitute.hail.annotations._ +import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ +import org.broadinstitute.hail.methods.FilterUtils.{FilterGenotypePostSA, FilterGenotypeWithSA} +import 
org.broadinstitute.hail.variant._ import scala.reflect.ClassTag import scala.language.implicitConversions @@ -11,8 +15,52 @@ class FilterString(val s: String) extends AnyVal { def !~(t: String): Boolean = !this.~(t) } +object ConvertibleString { + val someRegex = """Some\(([0-9\.]+)\)""".r +} + +class ConvertibleString(val s: String) extends AnyVal { + def toArrayInt: Array[Int] = s.split(",").map(i => i.toInt) + def toArrayDouble: Array[Double] = s.split(",").map(i => i.toDouble) + def toSetString: Set[String] = s.split(",").toSet + def toStupidAnnotation: Array[Array[String]] = s.split(",").map(_.split("|").map(_.trim)) + def toOptionInt: Option[Int] = s match { + case ConvertibleString.someRegex(i) => Some(i.toInt) + case "None" => None + } + def toOptionDouble: Option[Double] = s match { + case ConvertibleString.someRegex(i) => Some(i.toDouble) + case "None" => None + } +} + object FilterUtils { + type FilterGenotypeWithSA = (Array[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean))) + type FilterGenotypePostSA = (Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean) implicit def toFilterString(s: String): FilterString = new FilterString(s) + + implicit def toConvertibleString(s: String): ConvertibleString = new ConvertibleString(s) + +// def test(): (Variant, Annotations[String]) => Boolean = { +// throw new UnsupportedOperationException +// } +} + +class EvaluatorWithTransformation[T, S](t: String, f: T => S)(implicit tct: ClassTag[T]) extends Serializable { + @transient var p: Option[S] = None + + def typeCheck() { + require(p.isEmpty) + p = Some(f(Utils.eval[T](t))) + } + + def eval(): S = p match { + case null | None => + val v = f(Utils.eval[T](t)) + p = Some(v) + v + case Some(v) => v + } } class Evaluator[T](t: String)(implicit tct: ClassTag[T]) @@ -33,29 +81,44 @@ class Evaluator[T](t: String)(implicit tct: ClassTag[T]) } } -class FilterVariantCondition(cond: String) - extends Evaluator[(Variant) => 
Boolean]( - "(v: org.broadinstitute.hail.variant.Variant) => { " + - "import org.broadinstitute.hail.methods.FilterUtils._; " + - cond + " }: Boolean") { - def apply(v: Variant): Boolean = eval()(v) +class FilterVariantCondition(cond: String, vas: AnnotationSignatures) + extends Evaluator[(Variant, AnnotationData) => Boolean]({ + "(v: org.broadinstitute.hail.variant.Variant, \n" + + "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + + "import org.broadinstitute.hail.methods.FilterUtils._; \n" + + signatures(vas, "__va") + + instantiate("va", "__va") + + cond + " }: Boolean"}) { + def apply(v: Variant, va: AnnotationData): Boolean = eval()(v, va) } -class FilterSampleCondition(cond: String) - extends Evaluator[(Sample) => Boolean]( - "(s: org.broadinstitute.hail.variant.Sample) => { " + +class FilterSampleCondition(cond: String, sas: AnnotationSignatures) + extends Evaluator[(Sample, AnnotationData) => Boolean]( + "(s: org.broadinstitute.hail.variant.Sample, \n" + + "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { " + "import org.broadinstitute.hail.methods.FilterUtils._; " + + signatures(sas, "__sa") + + instantiate("sa", "__sa") + cond + " }: Boolean") { - def apply(s: Sample): Boolean = eval()(s) + def apply(s: Sample, sa: AnnotationData): Boolean = eval()(s, sa) } -class FilterGenotypeCondition(cond: String) - extends Evaluator[(Variant, Sample, Genotype) => Boolean]( - "(v: org.broadinstitute.hail.variant.Variant, " + +class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: AnnotationSignatures, + sad: Array[AnnotationData]) + extends EvaluatorWithTransformation[FilterGenotypeWithSA, FilterGenotypePostSA]( + {"(__sa: Array[org.broadinstitute.hail.annotations.AnnotationData]) => {\n" + + "import org.broadinstitute.hail.methods.FilterUtils._\n" + + signatures(sas, "__sa") + + makeArray("__saArray", "__sa", "__sa") + + "(v: org.broadinstitute.hail.variant.Variant, " + + "__va: 
org.broadinstitute.hail.annotations.AnnotationData) => {\n" + + signatures(vas, "__va") + + instantiate("va", "__va") + + "(__sIndex: Int, " + "s: org.broadinstitute.hail.variant.Sample, " + - "g: org.broadinstitute.hail.variant.Genotype) => { " + - "import org.broadinstitute.hail.methods.FilterUtils._; " + - cond + " }: Boolean") { - def apply(v: Variant, s: Sample, g: Genotype): Boolean = - eval()(v, s, g) + "g: org.broadinstitute.hail.variant.Genotype) => {\n" + + "val sa = __saArray(__sIndex)\n" + + cond + " }: Boolean}}"}, t => t(sad)) { + def apply(sa: Array[AnnotationData])(v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): Boolean = + eval()(v, va)(sIndex, s, g) } diff --git a/src/main/scala/org/broadinstitute/hail/methods/LinearRegression.scala b/src/main/scala/org/broadinstitute/hail/methods/LinearRegression.scala index 441380648b9..8f889ad1c2e 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LinearRegression.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LinearRegression.scala @@ -135,7 +135,7 @@ object LinearRegression { val yypBc = sc.broadcast((y dot y) - (qty dot qty)) new LinearRegression(vds - .filterSamples(samplesWithCovDataBc.value.contains) + .filterSamples { case (s, sa) => samplesWithCovDataBc.value.contains(s) } .aggregateByVariantWithKeys[LinRegBuilder](new LinRegBuilder())( (lrb, v, s, g) => lrb.merge(sampleCovRowBc.value(s), g, yBc.value), (lrb1, lrb2) => lrb1.merge(lrb2)) diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index e29255d1cdf..bdcbf950ea4 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -5,9 +5,48 @@ import org.apache.spark.{SparkConf, SparkContext} import org.broadinstitute.hail.variant._ import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.vcf +import 
org.broadinstitute.hail.annotations._ object LoadVCF { // FIXME move to VariantDataset + + val arrayRegex = """Array\[(\w+)\]""".r + val setRegex = """Set\[(\w+)\]""".r + def getConversionMethod(str: String): String = { + str match { + case arrayRegex(subType) => s"toArray$subType" + case setRegex(subType) => s"toSet$subType" + case _ => s"to$str" + } + } + + def parseInfoType(str: String): String = { + str match { + case "Flag" => "Boolean" + case "Integer" => "Int" + case "Float" => "Double" + case "String" => "String" + case "Character" => "String" + case _ => throw new UnsupportedOperationException("unexpected annotation type") + } + } + + def parseInfoLine(number: String, typeOf: String, desc: String): AnnotationSignature = { + val parsedType = parseInfoType(typeOf) + if (number == "0" || number == "1") { + new SimpleSignature(parsedType, getConversionMethod(parsedType), desc) + } + else if (number == "A" || number == "R" || number == "G") { + val arrType = s"Array[$parsedType]" + new SimpleSignature(arrType, getConversionMethod(arrType), desc) + } + else if (number == "." 
&& parsedType == "String") { + new SimpleSignature(parsedType, getConversionMethod(parsedType), desc) + } + else + throw new UnsupportedOperationException + } + def apply(sc: SparkContext, file: String, compress: Boolean = true, @@ -25,6 +64,45 @@ object LoadVCF { .toArray } + val contigRegex ="""##contig=""".r + val contigLengths = { + val contigMap = headerLines.map { + case contigRegex(id, length) => + Some((id, length.toInt)) + case _ => None + }.flatMap(i => i) + .toMap + + if (contigMap.nonEmpty) + contigMap + else + null + } +// contigLengths.foreach { case (id, len) => println("contig=%s, length=%s".format(id, len))} + + val annoRegex = """##INFO=""".r + val annotationTypes = { + val annotationMap = headerLines.map { + case annoRegex(id, number, typeOf, desc) => Some(id, parseInfoLine(number, typeOf, desc)) + case _ => None + }.flatMap(i => i) + .toMap + + if (annotationMap.nonEmpty) + annotationMap + else + Map.empty[String, AnnotationSignature] + } + val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> annotationTypes), + Map("filters" -> new SimpleSignature("Set[String]", "toSetString", "filters applied to site"), + "pass" -> new SimpleSignature("Boolean", "toBoolean", "filters were applied && this site passed"), + "multiallelic" -> new SimpleSignature("Boolean", "toBoolean", "Site is a split multiallelic"), + "qual" -> new SimpleSignature("Double", "toDouble", "vcf qual field"), + "rsid" -> new SimpleSignature("String", "toString", "site rdID"))) +// annotationTypes.foreach { +// case (id, v) => println(s"id=$id, type=$v") +// } + val headerLine = headerLines.last assert(headerLine(0) == '#' && headerLine(1) != '#') @@ -32,21 +110,24 @@ object LoadVCF { .split("\t") .drop(9) + val sampleAnnotations = EmptySampleAnnotations(sampleIds.length) + val sampleAnnotationSignatures = EmptyAnnotationSignatures() + val headerLinesBc = sc.broadcast(headerLines) val genotypes = sc.textFile(file, 
nPartitions.getOrElse(sc.defaultMinPartitions)) .mapPartitions { lines => val reader = vcf.HtsjdkRecordReader(headerLinesBc.value) lines.filter(line => !line.isEmpty && line(0) != '#') .flatMap(reader.readRecord) - .map { case (v, gs) => + .map { case (v, va, gs) => val b = new GenotypeStreamBuilder(v, compress) for (g <- gs) b += g - (v, b.result(): Iterable[Genotype]) + (v, va, b.result(): Iterable[Genotype]) } } - // FIXME null should be contig lengths - VariantSampleMatrix(VariantMetadata(null, sampleIds), genotypes) + VariantSampleMatrix(VariantMetadata(contigLengths, sampleIds, + headerLines, sampleAnnotations, sampleAnnotationSignatures, annotationSignatures), genotypes) } } diff --git a/src/main/scala/org/broadinstitute/hail/variant/Genotype.scala b/src/main/scala/org/broadinstitute/hail/variant/Genotype.scala index 3fd6730fc1d..4f218e926b6 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/Genotype.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/Genotype.scala @@ -60,9 +60,12 @@ case class Genotype(private val gt: Int, def gtType: GenotypeType = GenotypeType(gt) def gq: Int = { - assert(gt != -1) - val (pl1, pl2) = minPl - pl1 min pl2 min 99 + if (gt == -1) + 0 + else { + val (pl1, pl2) = minPl + pl1 min pl2 min 99 + } } def call: Option[Call] = { diff --git a/src/main/scala/org/broadinstitute/hail/variant/RichRow.scala b/src/main/scala/org/broadinstitute/hail/variant/RichRow.scala index ad53c74113a..eecd4e59bb2 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/RichRow.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/RichRow.scala @@ -1,5 +1,7 @@ package org.broadinstitute.hail.variant +import org.broadinstitute.hail.annotations._ + import scala.language.implicitConversions import org.apache.spark.sql.Row @@ -43,4 +45,9 @@ class RichRow(r: Row) { if (ir.isNullAt(1)) None else Some(ir.getInt(1)), ir.getAs[Array[Byte]](2)) } + + def getVariantAnnotations(i: Int): AnnotationData = { + val ir = r.getAs[Row](i) + 
Annotations[String](ir.getAs[Map[String, Map[String, String]]](0), ir.getAs[Map[String, String]](0)) + } } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala index b5bce0f937a..d9ab9fc338a 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala @@ -1,7 +1,31 @@ package org.broadinstitute.hail.variant +import org.broadinstitute.hail.annotations._ + +object VariantMetadata { + def apply(contigLength: Map[String, Int], + sampleIds: Array[String]): VariantMetadata = VariantMetadata(contigLength, sampleIds, None, + EmptySampleAnnotations(sampleIds.length), EmptyAnnotationSignatures(), EmptyAnnotationSignatures()) + + def apply(contigLength: Map[String, Int], + sampleIds: Array[String], + vcfHeader: Array[String]): VariantMetadata = VariantMetadata(contigLength, sampleIds, Some(vcfHeader), + EmptySampleAnnotations(sampleIds.length), EmptyAnnotationSignatures(), EmptyAnnotationSignatures()) + + def apply(contigLength: Map[String, Int], sampleIds: Array[String], vcfHeader: Array[String], + sa: Array[AnnotationData], sas: AnnotationSignatures, vas: AnnotationSignatures): VariantMetadata = { + new VariantMetadata(contigLength, sampleIds, Some(vcfHeader), sa, sas, vas) + } +} + case class VariantMetadata(contigLength: Map[String, Int], - sampleIds: IndexedSeq[String]) { + sampleIds: Array[String], + vcfHeader: Option[Array[String]], + sampleAnnotations: Array[AnnotationData], + sampleAnnotationSignatures: AnnotationSignatures, + variantAnnotationSignatures: AnnotationSignatures) { + def nContigs: Int = contigLength.size + def nSamples: Int = sampleIds.length } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala index 735285d0a53..1a5b152ca06 100644 --- 
a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala @@ -7,6 +7,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.broadinstitute.hail.Utils._ import scala.language.implicitConversions +import org.broadinstitute.hail.annotations._ import scala.reflect.ClassTag import scala.reflect.runtime.universe._ @@ -14,8 +15,9 @@ import scala.reflect.runtime.universe._ object VariantSampleMatrix { def apply(metadata: VariantMetadata, - rdd: RDD[(Variant, Iterable[Genotype])]): VariantDataset = { + rdd: RDD[(Variant, AnnotationData, Iterable[Genotype])]): VariantDataset = { new VariantSampleMatrix(metadata, rdd) + } def read(sqlContext: SQLContext, dirname: String): VariantDataset = { @@ -27,17 +29,18 @@ object VariantSampleMatrix { // val df = sqlContext.read.parquet(dirname + "/rdd.parquet") val df = sqlContext.parquetFile(dirname + "/rdd.parquet") - new VariantSampleMatrix[Genotype](metadata, df.rdd.map(r => (r.getVariant(0), r.getGenotypeStream(1)))) + new VariantSampleMatrix[Genotype](metadata, df.rdd.map(r => + (r.getVariant(0), r.getVariantAnnotations(1), r.getGenotypeStream(2)))) } } class VariantSampleMatrix[T](val metadata: VariantMetadata, val localSamples: Array[Int], - val rdd: RDD[(Variant, Iterable[T])]) + val rdd: RDD[(Variant, AnnotationData, Iterable[T])]) (implicit ttt: TypeTag[T], tct: ClassTag[T], vct: ClassTag[Variant]) { - def this(metadata: VariantMetadata, rdd: RDD[(Variant, Iterable[T])]) + def this(metadata: VariantMetadata, rdd: RDD[(Variant, AnnotationData, Iterable[T])]) (implicit ttt: TypeTag[T], tct: ClassTag[T]) = this(metadata, Array.range(0, metadata.nSamples), rdd) @@ -49,7 +52,7 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, def copy[U](metadata: VariantMetadata = this.metadata, localSamples: Array[Int] = this.localSamples, - rdd: RDD[(Variant, Iterable[U])] = this.rdd) + rdd: RDD[(Variant, 
AnnotationData, Iterable[U])] = this.rdd) (implicit ttt: TypeTag[U], tct: ClassTag[U]): VariantSampleMatrix[U] = new VariantSampleMatrix(metadata, localSamples, rdd) @@ -61,24 +64,34 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, def nPartitions: Int = rdd.partitions.length - def variants: RDD[Variant] = rdd.keys + def variants: RDD[Variant] = rdd.map(_._1) + + def variantsAndAnnotations: RDD[(Variant, AnnotationData)] = rdd.map { case (v, va, gs) => (v, va) } def nVariants: Long = variants.count() def expand(): RDD[(Variant, Int, T)] = mapWithKeys[(Variant, Int, T)]((v, s, g) => (v, s, g)) + def expandWithAnnotation(): RDD[(Variant, AnnotationData, Int, T)] = + mapWithAll[(Variant, AnnotationData, Int, T)]((v, va, s, g) => (v, va, s, g)) def mapValues[U](f: (T) => U)(implicit utt: TypeTag[U], uct: ClassTag[U]): VariantSampleMatrix[U] = { - mapValuesWithKeys((v, s, g) => f(g)) + mapValuesWithAll((v, va, s, g) => f(g)) } def mapValuesWithKeys[U](f: (Variant, Int, T) => U) + (implicit utt: TypeTag[U], uct: ClassTag[U]): VariantSampleMatrix[U] = { + mapValuesWithAll((v, va, s, g) => f(v, s, g)) + } + + def mapValuesWithAll[U](f: (Variant, AnnotationData, Int, T) => U) (implicit utt: TypeTag[U], uct: ClassTag[U]): VariantSampleMatrix[U] = { val localSamplesBc = sparkContext.broadcast(localSamples) - copy(rdd = rdd.map { case (v, gs) => - (v, localSamplesBc.value.view.zip(gs.view) - .map { case (s, t) => f(v, s, t) }) + copy(rdd = rdd.map { case (v, va, gs) => + (v, va, localSamplesBc.value.view.zip(gs.view) + .map { case (s, t) => f(v, va, s, t) }) + }) } @@ -88,34 +101,43 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, def mapWithKeys[U](f: (Variant, Int, T) => U)(implicit uct: ClassTag[U]): RDD[U] = { val localSamplesBc = sparkContext.broadcast(localSamples) rdd - .flatMap { case (v, gs) => localSamplesBc.value.view.zip(gs.view) + .flatMap { case (v, va, gs) => localSamplesBc.value.view.zip(gs.view) .map { case (s, g) => f(v, s, g) } } 
} + def mapWithAll[U](f: (Variant, AnnotationData, Int, T) => U)(implicit uct: ClassTag[U]): RDD[U] = { + val localSamplesBc = sparkContext.broadcast(localSamples) + rdd + .flatMap { case (v, va, gs) => localSamplesBc.value.view.zip(gs.view) + .map { case (s, g) => f(v, va, s, g) } + } + } + def flatMap[U](f: T => TraversableOnce[U])(implicit uct: ClassTag[U]): RDD[U] = flatMapWithKeys((v, s, g) => f(g)) def flatMapWithKeys[U](f: (Variant, Int, T) => TraversableOnce[U])(implicit uct: ClassTag[U]): RDD[U] = { val localSamplesBc = sparkContext.broadcast(localSamples) rdd - .flatMap { case (v, gs) => localSamplesBc.value.view.zip(gs.view) + .flatMap { case (v, va, gs) => localSamplesBc.value.view.zip(gs.view) .flatMap { case (s, g) => f(v, s, g) } } } - def filterVariants(ilist: IntervalList): VariantSampleMatrix[T] = - filterVariants(v => ilist.contains(v.contig, v.start)) - - def filterVariants(p: (Variant) => Boolean): VariantSampleMatrix[T] = - copy(rdd = rdd.filter { case (v, _) => p(v) }) + def filterVariants(p: (Variant, Annotations[String]) => Boolean) = + copy(rdd = rdd.filter { case (v, va, gs) => p(v, va) }) - def filterSamples(p: (Int) => Boolean) = { - val localSamplesBc = sparkContext.broadcast(localSamples) - copy[T](localSamples = localSamples.filter(p), - rdd = rdd.map { case (v, gs) => - (v, localSamplesBc.value.view.zip(gs.view) - .filter { case (s, _) => p(s) } + def filterVariants(ilist: IntervalList): VariantSampleMatrix[T] = + filterVariants((v, va) => ilist.contains(v.contig, v.start)) + + def filterSamples(p: (Int, AnnotationData) => Boolean) = { + val localZipped = localSamples.zipWith(metadata.sampleAnnotations, (i: Int, j: AnnotationData) => (i, j)) + val localZippedBc = sparkContext.broadcast(localZipped) + copy[T](localSamples = localZipped.filter(p.tupled).map(_._1), + rdd = rdd.map { case (v, va, gs) => + (v, va, localZippedBc.value.view.zip(gs.view) + .filter { case ((s, sa), _) => p(s, sa) } .map(_._2)) }) } @@ -128,6 +150,12 @@ class 
VariantSampleMatrix[T](val metadata: VariantMetadata, def aggregateBySampleWithKeys[U](zeroValue: U)( seqOp: (U, Variant, Int, T) => U, combOp: (U, U) => U)(implicit utt: TypeTag[U], uct: ClassTag[U]): RDD[(Int, U)] = { + aggregateBySampleWithAll(zeroValue)((e, v, va, s, g) => seqOp(e, v, s, g), combOp) + } + + def aggregateBySampleWithAll[U](zeroValue: U)( + seqOp: (U, Variant, AnnotationData, Int, T) => U, + combOp: (U, U) => U)(implicit utt: TypeTag[U], uct: ClassTag[U]): RDD[(Int, U)] = { val localSamplesBc = sparkContext.broadcast(localSamples) @@ -137,15 +165,15 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, zeroBuffer.get(zeroArray) rdd - .mapPartitions { (it: Iterator[(Variant, Iterable[T])]) => + .mapPartitions { (it: Iterator[(Variant, AnnotationData, Iterable[T])]) => val serializer = SparkEnv.get.serializer.newInstance() def copyZeroValue() = serializer.deserialize[U](ByteBuffer.wrap(zeroArray)) val arrayZeroValue = Array.fill[U](localSamplesBc.value.length)(copyZeroValue()) localSamplesBc.value.iterator - .zip(it.foldLeft(arrayZeroValue) { case (acc, (v, gs)) => + .zip(it.foldLeft(arrayZeroValue) { case (acc, (v, va, gs)) => for ((g, i) <- gs.zipWithIndex) - acc(i) = seqOp(acc(i), v, localSamplesBc.value(i), g) + acc(i) = seqOp(acc(i), v, va, localSamplesBc.value(i), g) acc }.iterator) }.foldByKey(zeroValue)(combOp) @@ -154,11 +182,17 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, def aggregateByVariant[U](zeroValue: U)( seqOp: (U, T) => U, combOp: (U, U) => U)(implicit utt: TypeTag[U], uct: ClassTag[U]): RDD[(Variant, U)] = - aggregateByVariantWithKeys(zeroValue)((e, v, s, g) => seqOp(e, g), combOp) + aggregateByVariantWithAll(zeroValue)((e, v, va, s, g) => seqOp(e, g), combOp) def aggregateByVariantWithKeys[U](zeroValue: U)( seqOp: (U, Variant, Int, T) => U, combOp: (U, U) => U)(implicit utt: TypeTag[U], uct: ClassTag[U]): RDD[(Variant, U)] = { + aggregateByVariantWithAll(zeroValue)((e, v, va, s, g) => seqOp(e, v, s, 
g), combOp) + } + + def aggregateByVariantWithAll[U](zeroValue: U)( + seqOp: (U, Variant, AnnotationData, Int, T) => U, + combOp: (U, U) => U)(implicit utt: TypeTag[U], uct: ClassTag[U]): RDD[(Variant, U)] = { val localSamplesBc = sparkContext.broadcast(localSamples) @@ -168,12 +202,12 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, zeroBuffer.get(zeroArray) rdd - .map { case (v, gs) => + .map { case (v, va, gs) => val serializer = SparkEnv.get.serializer.newInstance() val zeroValue = serializer.deserialize[U](ByteBuffer.wrap(zeroArray)) (v, gs.zipWithIndex.foldLeft(zeroValue) { case (acc, (g, i)) => - seqOp(acc, v, localSamplesBc.value(i), g) + seqOp(acc, v, va, localSamplesBc.value(i), g) }) } } @@ -189,12 +223,12 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, zeroBuffer.get(zeroArray) rdd - .mapPartitions { (it: Iterator[(Variant, Iterable[T])]) => + .mapPartitions { (it: Iterator[(Variant, AnnotationData, Iterable[T])]) => val serializer = SparkEnv.get.serializer.newInstance() def copyZeroValue() = serializer.deserialize[T](ByteBuffer.wrap(zeroArray))(localtct) val arrayZeroValue = Array.fill[T](localSamplesBc.value.length)(copyZeroValue()) localSamplesBc.value.iterator - .zip(it.foldLeft(arrayZeroValue) { case (acc, (v, gs)) => + .zip(it.foldLeft(arrayZeroValue) { case (acc, (v, va, gs)) => for ((g, i) <- gs.zipWithIndex) acc(i) = combOp(acc(i), g) acc @@ -203,19 +237,53 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, } def foldByVariant(zeroValue: T)(combOp: (T, T) => T): RDD[(Variant, T)] = - rdd.mapValues(_.foldLeft(zeroValue)((acc, g) => combOp(acc, g))) + rdd.map { case (v, va, gs) => (v, gs.foldLeft(zeroValue)((acc, g) => combOp(acc, g))) } def same(that: VariantSampleMatrix[T]): Boolean = { metadata == that.metadata && localSamples.sameElements(that.localSamples) && - rdd.fullOuterJoin(that.rdd) + rdd.map { case (v, va, gs) => (v, (va, gs)) } + .fullOuterJoin(that.rdd.map { case (v, va, gs) => (v, (va, gs)) 
}) .map { case (v, t) => t match { - case (Some(it1), Some(it2)) => - it1.sameElements(it2) + case (Some((va1, it1)), Some((va2, it2))) => + it1.sameElements(it2) && va1.equals(va2) case _ => false } }.reduce(_ && _) } + def mapAnnotationsWithAggregate[U](zeroValue: U)( + seqOp: (U, Variant, Int, T) => U, + combOp: (U, U) => U, + mapOp: (AnnotationData, U) => AnnotationData) + (implicit utt: TypeTag[U], uct: ClassTag[U]): VariantSampleMatrix[T] = { + val localSamplesBc = sparkContext.broadcast(localSamples) + // Serialize the zero value to a byte array so that we can get a new clone of it on each key + val zeroBuffer = SparkEnv.get.serializer.newInstance().serialize(zeroValue) + val zeroArray = new Array[Byte](zeroBuffer.limit) + zeroBuffer.get(zeroArray) + + this.copy(rdd = rdd + .map { case (v, va, gs) => + val serializer = SparkEnv.get.serializer.newInstance() + val zeroValue = serializer.deserialize[U](ByteBuffer.wrap(zeroArray)) + + (v, mapOp(va, gs.zipWithIndex.foldLeft(zeroValue) { case (acc, (g, i)) => + seqOp(acc, v, localSamplesBc.value(i), g) + }), gs) + }) + } + + def addVariantSignatures(maps: Map[String, Map[String, AnnotationSignature]] = Map.empty[String, Map[String, AnnotationSignature]], + vals: Map[String, AnnotationSignature] = Map.empty[String, AnnotationSignature]): VariantSampleMatrix[T] = { + this.copy(metadata = this.metadata.copy(variantAnnotationSignatures = + this.metadata.variantAnnotationSignatures.addMaps(maps).addVals(vals))) + } + + def addSampleSignatures(maps: Map[String, Map[String, AnnotationSignature]] = Map.empty[String, Map[String, AnnotationSignature]], + vals: Map[String, AnnotationSignature] = Map.empty[String, AnnotationSignature]): VariantSampleMatrix[T] = { + this.copy(metadata = this.metadata.copy(sampleAnnotationSignatures = + this.metadata.sampleAnnotationSignatures.addMaps(maps).addVals(vals))) + } } // FIXME AnyVal Scala 2.11 @@ -233,7 +301,7 @@ class RichVDS(vds: VariantDataset) { // 
rdd.toDF().write.parquet(dirname + "/rdd.parquet") vds.rdd - .map { case (v, gs) => (v, gs.toGenotypeStream(v, compress)) } + .map { case (v, va, gs) => (v, gs.toGenotypeStream(v, compress)) } .toDF() .saveAsParquetFile(dirname + "/rdd.parquet") } diff --git a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala index 12758d736b6..884eb4a33f2 100644 --- a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala +++ b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala @@ -2,6 +2,8 @@ package org.broadinstitute.hail.vcf import htsjdk.variant.variantcontext.Allele import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ +import org.broadinstitute.hail.annotations.AnnotationUtils.annotationToString import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -13,17 +15,39 @@ class BufferedLineIterator(bit: BufferedIterator[String]) extends htsjdk.tribble override def next(): String = bit.next() - override def remove() { throw new UnsupportedOperationException } + override def remove() { + throw new UnsupportedOperationException + } } class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializable { - def readRecord(line: String): Iterator[(Variant, Iterator[Genotype])] = { - + def readRecord(line: String): Iterator[(Variant, AnnotationData, Iterator[Genotype])] = { val vc = codec.decode(line) + //maybe count tabs to get filter field + val pass = (vc.filtersWereApplied() && vc.getFilters.size() == 0).toString + val filts = { + if (vc.filtersWereApplied && vc.isNotFiltered) + "PASS" + else + vc.getFilters.toArray.map(_.toString).reduceRight(_ + "," + _) + } + val rsid = vc.getID +// println(s"nFilters=%d".format(filts.length)) +// println("qual=%.2f".format(vc.getPhredScaledQual)) +// println("Filters are: ") +// filts.foreach(println(_)) if (vc.isBiallelic) { val variant = 
Variant(vc.getContig, vc.getStart, vc.getReference.getBaseString, vc.getAlternateAllele(0).getBaseString) - Iterator.single((variant, + Iterator.single((variant, Annotations[String](Map[String, Map[String, String]]("info" -> vc.getAttributes + .asScala + .mapValues(annotationToString) + .toMap), + Map[String, String]( + "qual" -> vc.getPhredScaledQual.toString, + "filters" -> filts, + "pass" -> pass, + "rsid" -> rsid)), for (g <- vc.getGenotypes.iterator.asScala) yield { val gt = if (g.isNoCall) @@ -65,8 +89,19 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl val ref = vc.getReference val alts = vc.getAlternateAlleles.asScala.filter(_ != Allele.SPAN_DEL) val altIndices = alts.map(vc.getAlleleIndex) // index in the VCF, used to access AD and PL fields - val biVs = alts.map{ alt => Variant(vc.getContig, vc.getStart, ref.getBaseString, alt.getBaseString) } //FixMe: need to normalize strings - val biGBs = alts.map{ _ => new ArrayBuffer[Genotype] } + val biVs = alts.map { alt => (Variant(vc.getContig, vc.getStart, ref.getBaseString, alt.getBaseString), + Annotations[String](Map[String, Map[String, String]]("info" -> vc.getAttributes + .asScala + .mapValues(annotationToString) + .toMap), + Map[String, String]( + "qual" -> vc.getPhredScaledQual.toString, + "filters" -> filts, + "pass" -> pass, + "rsid" -> rsid, + "multiallelic" -> "true"))) + } //FixMe: need to normalize strings + val biGBs = alts.map { _ => new ArrayBuffer[Genotype] } for (g <- vc.getGenotypes.iterator.asScala) { for (((alt, j), i) <- alts.zip(altIndices).zipWithIndex) { @@ -79,7 +114,7 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl val ad = if (g.hasAD) { val gad = g.getAD (gad.sum - gad(j), gad(j)) // consistent with downcoding other alts to the ref -// (gad(0), gad(j)) // what bcftools does + // (gad(0), gad(j)) // what bcftools does } else (0, 0) @@ -95,17 +130,19 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) 
extends Serializabl val gpl = g.getPL def pl(gt: Int) = (for (k <- 0 until n; l <- k until n; if List(k, l).count(_ == j) == gt) yield l * (l + 1) / 2 + k).map(gpl).min - (pl(0), pl(1), pl(2)) // for each downcoded genotype, minimum PL among original genotypes that downcode to it -// (gpl(0), gpl(j * (j + 1) / 2), gpl(j * (j + 1) / 2 + j)) // what bcftools does; ignores all het-non-ref PLs + (pl(0), pl(1), pl(2)) // for each downcoded genotype, minimum PL among original genotypes that downcode to it + // (gpl(0), gpl(j * (j + 1) / 2), gpl(j * (j + 1) / 2 + j)) // what bcftools does; ignores all het-non-ref PLs } else - (0,0,0) + (0, 0, 0) } else null biGBs(i) += Genotype(gt, ad, dp, pl) } } - biVs.iterator.zip(biGBs.iterator.map(_.iterator)) + biVs.iterator.zip(biGBs.iterator.map(_.iterator)).map { + case ((v, vi), gs) => (v, vi, gs) + } } } } diff --git a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala new file mode 100644 index 00000000000..d316fb3cf7f --- /dev/null +++ b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala @@ -0,0 +1,79 @@ +package org.broadinstitute.hail.annotations + +import org.broadinstitute.hail.SparkSuite +import org.broadinstitute.hail.Utils._ +import org.broadinstitute.hail.driver._ +import org.broadinstitute.hail.variant.{Genotype, IntervalList, Variant} +import org.scalacheck.Gen +import org.testng.annotations.Test +import org.broadinstitute.hail.methods.LoadVCF +import org.broadinstitute.hail.methods.FilterVariantCondition + +class AnnotationsSuite extends SparkSuite { + + def getFunction(cond: String, vas: AnnotationSignatures): (Variant, AnnotationData) => Boolean = { + cond match { + case c: String => + try { + val cf = new FilterVariantCondition(c, vas) + cf.typeCheck() + cf.apply + } catch { + case e: scala.tools.reflect.ToolBoxError => + /* e.message looks like: + reflective compilation has failed: + + ';' 
expected but '.' found. */ + fatal("parse error in condition: " + e.message.split("\n").last) + } + } + } + + @Test def test() { + + val vds = LoadVCF(sc, "src/test/resources/sample.vcf") + val state = State("", sc, sqlContext, vds) + +// val vds = LoadVCF(sc, "src/test/resources/linearRegression.vcf") + val vas = vds.metadata.variantAnnotationSignatures + + + //FIXME involve every type of thing we can generate, involve options... look at vcf spec 4.2 ... + val vTotal = vds.nVariants + + val cond1 = "true" + val p1 = getFunction(cond1, vas) + assert(vds.filterVariants(p1).nVariants == vTotal) + + + val cond2 = "va.info.FS == 0" + val p2 = getFunction(cond2, vas) + assert(vds.filterVariants(p2).nVariants == 132) + + val cond3 = "va.info.HWP == 1" + val p3 = getFunction(cond3, vas) + assert(vds.filterVariants(p3).nVariants == 159) + + val state2 = VariantQC.run(state, Array("--store")) +// state2.vds.metadata.variantAnnotationSignatures.maps.foreach{case (k,m) => + // m.foreach {case (k2,ss) => println(k2 + " " + ss.conversion)} } +// state2.vds.rdd.map { case (v,va,gs) => va } +// .collect() +// .apply(1) +// .maps("qc").foreach(println(_)) + println(FilterVariants.run(state2, Array("--keep", "-c", "(va.qc.MAF.isDefined && va.qc.MAF.get > 0.05)")) + .vds + .nVariants) + +// FilterGenotypes.run(state2, Array("--keep", "-c", "g.dp > 100")).vds +// .rdd +// .map { case (v, va, gs) => (v, va, gs.toArray) } +// .collect()(0)._3.foreach(println) + ExportVariants.run(state2, Array("--output", + "src/test/resources/sample.vcf.exportVariants", "-c", "v.contig,v.start,va.qc.rHetHomVar,va.qc.MAF,va.qc.dpMean")) + val state3 = SampleQC.run(state2, Array("--store")) + ExportSamples.run(state3, Array("--output", "src/test/resources/sample.vcf.exportSamples", "-c", "s.id,sa.qc.dpMean,sa.qc.nHet")) +// assert({val nV = .vds.nVariants; +// println(s"nV = $nV"); nV > 0}) + } +} diff --git a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala 
b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala index 5d0e2bfcb54..85f44e7c5f5 100644 --- a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala +++ b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala @@ -2,6 +2,7 @@ package org.broadinstitute.hail.utils import scala.util.Random import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ import org.apache.spark.SparkContext import scala.math @@ -83,7 +84,7 @@ object TestRDDBuilder { nSamples(Int) -- number of samples (columns) to produce in VCF nVariants(Int) -- number of variants(rows) to produce in VCF sc(SparkContext) -- spark context in which to operate - vsmtype(String) -- sparky, tuple, or managed + vsmtype(String) -- sparky gqArray(Array[Array[Int]]] -- Int array of dimension (nVariants x nSamples) dpArray(Array[Array[Int]]] -- Int array of dimension (nVariants x nSamples) Returns a test VDS of the given parameters */ @@ -118,7 +119,7 @@ object TestRDDBuilder { b += Genotype(gt, ad, dp, pl) } - (variant, b.result(): Iterable[Genotype]) + (variant, EmptyAnnotations(), b.result(): Iterable[Genotype]) } VariantSampleMatrix(VariantMetadata(Map("1" -> 1000000), sampleList), streamRDD) } diff --git a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala index 2de3c9a0c5e..18b899e5432 100644 --- a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala @@ -111,7 +111,7 @@ class VSMSuite extends SparkSuite { val localKeep = keep val filtered = LoadVCF(sc, "src/test/resources/sample.vcf.gz") - .filterSamples(s => localKeep(s)) + .filterSamples((s, sa) => localKeep(s)) val filteredAsMap = filtered.mapWithKeys((v, s, g) => ((v, s), g)).collectAsMap() filteredAsMap.foreach { case (k, g) => simpleAssert(vdsAsMap(k) == g) } From ccee7d8be3f1f83998745ab8c4478afeb4bca571 Mon Sep 17 00:00:00 
2001 From: tpoterba Date: Wed, 9 Dec 2015 16:42:14 -0500 Subject: [PATCH 02/15] Ready for code review --- .../scala/org/broadinstitute/hail/Utils.scala | 7 +- .../annotations/AnnotationSignature.scala | 1 + .../hail/annotations/Annotations.scala | 15 +- .../hail/annotations/SimpleSignature.scala | 2 +- .../hail/annotations/StupidAnnotation.scala | 7 - .../hail/driver/ExportGenotypes.scala | 76 +++++++++ .../hail/driver/ExportSamples.scala | 4 +- .../hail/driver/ExportVariants.scala | 4 +- .../hail/driver/FilterGenotypes.scala | 9 +- .../broadinstitute/hail/driver/SampleQC.scala | 8 +- .../hail/driver/VariantQC.scala | 5 +- .../hail/methods/ExportTSV.scala | 41 ++++- .../broadinstitute/hail/methods/Filter.scala | 10 +- .../broadinstitute/hail/methods/LoadVCF.scala | 6 +- .../hail/variant/VariantMetadata.scala | 12 +- .../hail/variant/VariantSampleMatrix.scala | 21 +-- .../hail/annotations/AnnotationsSuite.scala | 148 +++++++++++------- .../hail/methods/ExportSuite.scala | 58 +++++++ .../hail/methods/FilterSuite.scala | 10 +- .../hail/variant/vsm/VSMSuite.scala | 60 ++++--- 20 files changed, 362 insertions(+), 142 deletions(-) delete mode 100644 src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala create mode 100644 src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala create mode 100644 src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala diff --git a/src/main/scala/org/broadinstitute/hail/Utils.scala b/src/main/scala/org/broadinstitute/hail/Utils.scala index 07bc4dbf316..9240a181221 100644 --- a/src/main/scala/org/broadinstitute/hail/Utils.scala +++ b/src/main/scala/org/broadinstitute/hail/Utils.scala @@ -210,7 +210,7 @@ class RichRDD[T](val r: RDD[T]) extends AnyVal { def writeTable(filename: String, header: String = null) { if (header != null) writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) {_.write(header)} - hadoopDelete(filename, r.sparkContext.hadoopConfiguration, true) + 
hadoopDelete(filename, r.sparkContext.hadoopConfiguration, recursive = true) r.saveAsTextFile(filename) } } @@ -240,6 +240,7 @@ class RichOption[T](val o: Option[T]) extends AnyVal { class RichStringBuilder(val sb: mutable.StringBuilder) extends AnyVal { def tsvAppend[T](v: Option[T]) { v match { + case Some(d: Double) => sb.append(stringFormatDouble(d)) case Some(x) => sb.append(x) case None => sb.append("NA") } @@ -414,6 +415,10 @@ object Utils { if (!p) throw new AssertionError } + def stringFormatDouble(d: Double): String = { + d.formatted("%.4e") + } + // FIXME Would be nice to have a version that averages three runs, perhaps even discarding an initial run. In this case the code block had better be functional! def printTime[T](block: => T) = { val timed = time(block) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala index 8754baa86fe..e71b4a20ebd 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala @@ -4,4 +4,5 @@ abstract class AnnotationSignature { def buildCaseClasses: String def conversion: String def getType: String + } diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index a12072d2788..7c11391c3c3 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -64,12 +64,6 @@ case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T .--(newVals.keys) .++(newVals)) } - - def equals(other: Annotations[T]): Boolean = { - vals.forall { case (k, v) => other.vals.contains(k) && other.vals(k) == v} && - maps.forall { case (mName, m) => other.maps.contains(mName) && m.forall { - case (k, v) => 
other.maps(mName).contains(k) && other.maps(mName)(k) == v }} - } } object EmptyAnnotationSignatures { @@ -85,10 +79,9 @@ object EmptyAnnotations { } object EmptySampleAnnotations { - def apply(nSamples: Int): Array[AnnotationData] = { + def apply(nSamples: Int): IndexedSeq[AnnotationData] = { (0 until nSamples) .map(i => Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String])) - .toArray } } @@ -145,9 +138,9 @@ object AnnotationClassBuilder { s"val $exposedName = new ${hiddenClassName}Annotations($hiddenClassName)\n" } - def makeArray(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = { - s"val $hiddenOutputName: Array[${hiddenClassName}Annotations] = " + - "$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_))\n" + def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = { + s"val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] = " + + s"$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_))\n" } val arrayRegex = """Array\[(\w+)\]""".r diff --git a/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala index 2037358c290..e2ec601c691 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala @@ -1,6 +1,6 @@ package org.broadinstitute.hail.annotations -class SimpleSignature(scalaType: String, conversionMethod: String, description: String) +case class SimpleSignature(scalaType: String, conversionMethod: String, description: String) extends AnnotationSignature { def buildCaseClasses: String = "" diff --git a/src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala b/src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala deleted file mode 100644 index 
e62413e854f..00000000000 --- a/src/main/scala/org/broadinstitute/hail/annotations/StupidAnnotation.scala +++ /dev/null @@ -1,7 +0,0 @@ -package org.broadinstitute.hail.annotations - -class StupidAnnotation() extends AnnotationSignature { - def buildCaseClasses: String = throw new UnsupportedOperationException - def conversion: String = throw new UnsupportedOperationException - def getType: String = throw new UnsupportedOperationException -} diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala new file mode 100644 index 00000000000..3fc5d94a8ae --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala @@ -0,0 +1,76 @@ +package org.broadinstitute.hail.driver + +import org.broadinstitute.hail.Utils._ +import org.broadinstitute.hail.methods._ +import org.broadinstitute.hail.variant._ +import org.broadinstitute.hail.annotations._ +import org.kohsuke.args4j.{Option => Args4jOption} + +object ExportGenotypes extends Command { + + class Options extends BaseOptions { + + @Args4jOption(required = true, name = "-o", aliases = Array("--output"), + usage = "path of output tsv") + var output: String = _ + + @Args4jOption(required = true, name = "-c", aliases = Array("--condition"), + usage = "Comma-separated list of fields to be printed to tsv") + var condition: String = _ + + @Args4jOption(required = false, name = "--missing", + usage = "Format of missing values (Default: 'NA')") + var missing = "NA" + } + + def newOptions = new Options + + def name = "exportgenotypes" + + def description = "Export list of sample-variant information to tsv" + + def run(state: State, options: Options): State = { + val vds = state.vds + + val cond = options.condition + + val output = options.output + + val vas: AnnotationSignatures = state.vds.metadata.variantAnnotationSignatures + val sas: AnnotationSignatures = state.vds.metadata.sampleAnnotationSignatures + val sa = 
state.vds.metadata.sampleAnnotations + + val makeString: IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => + ((Int, Sample, Genotype) => String)) = try { + val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, options.missing) + cf.typeCheck() + cf.apply + } + catch { + case e: scala.tools.reflect.ToolBoxError => + /* e.message looks like: + reflective compilation has failed: + + ';' expected but '.' found. */ + fatal("parse error in condition: " + e.message.split("\n").last) + } + + val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) + + val stringVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => + makeString(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g)) + + writeTextFile(output + ".header", state.hadoopConf) { s => + s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) + s.write("\n") + } + + hadoopDelete(output, state.hadoopConf, recursive = true) + + stringVDS.rdd + .flatMap { case (v, va, strings) => strings} + .saveAsTextFile(output) + + state + } +} diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala index f0c3bc7b6fd..667d19c3f07 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala @@ -53,11 +53,11 @@ object ExportSamples extends Command { } writeTextFile(output + ".header", state.hadoopConf) { s => - s.write(cond.split(",").reduceRight(_ + "\t" + _)) + s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) s.write("\n") } - hadoopDelete(output, state.hadoopConf, true) + hadoopDelete(output, state.hadoopConf, recursive = true) vds.sparkContext.parallelize(vds.sampleIds.map(Sample).zip(vds.metadata.sampleAnnotations)) .map { case (s, sa) => makeString(s, sa)} diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala 
b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala index 3f0a9768bb8..1a03c34eba3 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala @@ -53,11 +53,11 @@ object ExportVariants extends Command { } writeTextFile(output + ".header", state.hadoopConf) { s => - s.write(cond.split(",").reduceRight(_ + "\t" + _)) + s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) s.write("\n") } - hadoopDelete(output, state.hadoopConf, true) + hadoopDelete(output, state.hadoopConf, recursive = true) vds.variantsAndAnnotations .map { case (v, va) => makeString(v, va) } diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala index 69c3c8bd628..f2565a81ef6 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala @@ -35,7 +35,7 @@ object FilterGenotypes extends Command { if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") - val p: Array[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean)) = try { + val p: IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean)) = try { val cf = new FilterGenotypeCondition(options.condition, vas, sas, sa) cf.typeCheck() cf.apply @@ -50,14 +50,15 @@ object FilterGenotypes extends Command { } val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) - + val localKeep = options.keep + val localRemove = options.remove //FIXME put keep/remove logic here val newVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => if (p(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g)) { - g + if (localKeep) g else Genotype(-1, (0, 0), 0, null) } else { - Genotype(-1, (0, 0), 0, null) + if (localRemove) g 
else Genotype(-1, (0, 0), 0, null) }) state.copy(vds = newVDS) } diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index e02ba694b19..805e49c0b6a 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -332,10 +332,10 @@ object SampleQC extends Command { .map{ case (sa, s) => sa.addMap("qc", r(s).asMap) } state.copy( vds = vds.copy( - metadata=vds.metadata.copy( - sampleAnnotations = newAnnotations, + metadata = vds.metadata.copy( sampleAnnotationSignatures = vds.metadata.sampleAnnotationSignatures - .addMap("qc", SampleQCCombiner.signatures)))) + .addMap("qc", SampleQCCombiner.signatures), + sampleAnnotations = newAnnotations))) } else { @@ -348,7 +348,7 @@ object SampleQC extends Command { val singletons = sSingletonVariants(vds) val sampleIdsBc = state.sc.broadcast(vds.sampleIds) - hadoopDelete(output, state.hadoopConf, true) + hadoopDelete(output, state.hadoopConf, recursive = true) val r = results(vds, singletons) .map { case (s, comb) => val sb = new StringBuilder() diff --git a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala index a6c80de11cd..158ba3aaaf9 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala @@ -198,7 +198,8 @@ class VariantQCCombiner extends Serializable { val hwe = HWEStats sb.tsvAppend(hwe._1) - sb.append(hwe._2) + sb += '\t' + sb ++= stringFormatDouble(hwe._2) } def asMap: Map[String, String] = { @@ -282,7 +283,7 @@ object VariantQC extends Command { val qcResults = results(vds) - hadoopDelete(output, state.hadoopConf, true) + hadoopDelete(output, state.hadoopConf, recursive = true) val r = results(vds) .map { case (v, comb) => val sb = new StringBuilder() diff --git 
a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index a85f54a36a6..474cb533017 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -2,11 +2,13 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.annotations._ -import org.broadinstitute.hail.variant.{Sample, Variant} +import org.broadinstitute.hail.variant.{Sample, Variant, Genotype} +import org.broadinstitute.hail.Utils.stringFormatDouble import scala.language.implicitConversions object Formatter { def writeOption(o: Option[Any], missingValue: String): String = o match { + case Some(d: Double) => stringFormatDouble(d) case Some(x) => x.toString case None => missingValue } @@ -15,11 +17,18 @@ object Formatter { class Formatter[T](val t: T) extends AnyVal { def flattenOptions(missingValue: String): String = t match { case x: Option[Any] => Formatter.writeOption(x, missingValue) + case x: Iterable[Any] => x.map(_.toString).reduceRight(_ + "," + _) + case d: Double => stringFormatDouble(d) case _ => t.toString } } object ExportUtils { + type ExportGenotypeWithSA = (IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => String))) + type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Sample, Genotype) => String) +} + +object UserExportUtils { implicit def toFormatter[T](t: T): Formatter[T] = new Formatter(t) } @@ -28,8 +37,8 @@ class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures, missingVa extends Evaluator[(Variant, AnnotationData) => String]({ "(v: org.broadinstitute.hail.variant.Variant, \n" + "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + - "import org.broadinstitute.hail.methods.FilterUtils._; \n" + - "import org.broadinstitute.hail.methods.ExportUtils._; \n" + + 
"import org.broadinstitute.hail.methods.FilterUtils._\n" + + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + signatures(vas, "__va") + instantiate("va", "__va") + s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String"""}) { @@ -40,10 +49,32 @@ class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures, missingVal extends Evaluator[(Sample, AnnotationData) => String]({ "(s: org.broadinstitute.hail.variant.Sample, \n" + "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + - "import org.broadinstitute.hail.methods.FilterUtils._; \n" + - "import org.broadinstitute.hail.methods.ExportUtils._; \n" + + "import org.broadinstitute.hail.methods.FilterUtils._\n" + + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + signatures(sas, "__sa") + instantiate("sa", "__sa") + s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String"""}) { def apply(s: Sample, sa: AnnotationData): String = eval()(s, sa) +} + +class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: AnnotationSignatures, + sad: IndexedSeq[AnnotationData], missingValue: String) + extends EvaluatorWithTransformation[ExportUtils.ExportGenotypeWithSA, ExportUtils.ExportGenotypePostSA]( + {"(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData]) => {\n" + + "import org.broadinstitute.hail.methods.FilterUtils._\n" + + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + + signatures(sas, "__sa") + + makeIndexedSeq("__saArray", "__sa", "__sa") + + "(v: org.broadinstitute.hail.variant.Variant, " + + "__va: org.broadinstitute.hail.annotations.AnnotationData) => {\n" + + signatures(vas, "__va") + + instantiate("va", "__va") + + "(__sIndex: Int, " + + "s: org.broadinstitute.hail.variant.Sample, " + + "g: org.broadinstitute.hail.variant.Genotype) => {\n" + + "val sa = __saArray(__sIndex)\n" + + s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ 
+ "\t" + _)}: String}}"""}, t => t(sad)) { + def apply(sa: IndexedSeq[AnnotationData]) + (v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): String = + eval()(v, va)(sIndex, s, g) } \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 2278d8b4edb..dc0fc91a850 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -35,7 +35,7 @@ class ConvertibleString(val s: String) extends AnyVal { } object FilterUtils { - type FilterGenotypeWithSA = (Array[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean))) + type FilterGenotypeWithSA = (IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean))) type FilterGenotypePostSA = (Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean) implicit def toFilterString(s: String): FilterString = new FilterString(s) @@ -104,12 +104,12 @@ class FilterSampleCondition(cond: String, sas: AnnotationSignatures) } class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: AnnotationSignatures, - sad: Array[AnnotationData]) + sad: IndexedSeq[AnnotationData]) extends EvaluatorWithTransformation[FilterGenotypeWithSA, FilterGenotypePostSA]( - {"(__sa: Array[org.broadinstitute.hail.annotations.AnnotationData]) => {\n" + + {"(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData]) => {\n" + "import org.broadinstitute.hail.methods.FilterUtils._\n" + signatures(sas, "__sa") + - makeArray("__saArray", "__sa", "__sa") + + makeIndexedSeq("__saArray", "__sa", "__sa") + "(v: org.broadinstitute.hail.variant.Variant, " + "__va: org.broadinstitute.hail.annotations.AnnotationData) => {\n" + signatures(vas, "__va") + @@ -119,6 +119,6 @@ class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: Anno "g: 
org.broadinstitute.hail.variant.Genotype) => {\n" + "val sa = __saArray(__sIndex)\n" + cond + " }: Boolean}}"}, t => t(sad)) { - def apply(sa: Array[AnnotationData])(v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): Boolean = + def apply(sa: IndexedSeq[AnnotationData])(v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): Boolean = eval()(v, va)(sIndex, s, g) } diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index bdcbf950ea4..f5f79a75032 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -78,7 +78,6 @@ object LoadVCF { else null } -// contigLengths.foreach { case (id, len) => println("contig=%s, length=%s".format(id, len))} val annoRegex = """##INFO=""".r val annotationTypes = { @@ -95,13 +94,10 @@ object LoadVCF { } val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> annotationTypes), Map("filters" -> new SimpleSignature("Set[String]", "toSetString", "filters applied to site"), - "pass" -> new SimpleSignature("Boolean", "toBoolean", "filters were applied && this site passed"), + "pass" -> new SimpleSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), "multiallelic" -> new SimpleSignature("Boolean", "toBoolean", "Site is a split multiallelic"), "qual" -> new SimpleSignature("Double", "toDouble", "vcf qual field"), "rsid" -> new SimpleSignature("String", "toString", "site rdID"))) -// annotationTypes.foreach { -// case (id, v) => println(s"id=$id, type=$v") -// } val headerLine = headerLines.last assert(headerLine(0) == '#' && headerLine(1) != '#') diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala index d9ab9fc338a..72d6b625a8f 100644 --- 
a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala @@ -4,24 +4,24 @@ import org.broadinstitute.hail.annotations._ object VariantMetadata { def apply(contigLength: Map[String, Int], - sampleIds: Array[String]): VariantMetadata = VariantMetadata(contigLength, sampleIds, None, + sampleIds: Array[String]): VariantMetadata = new VariantMetadata(contigLength, sampleIds, None, EmptySampleAnnotations(sampleIds.length), EmptyAnnotationSignatures(), EmptyAnnotationSignatures()) def apply(contigLength: Map[String, Int], sampleIds: Array[String], - vcfHeader: Array[String]): VariantMetadata = VariantMetadata(contigLength, sampleIds, Some(vcfHeader), + vcfHeader: Array[String]): VariantMetadata = new VariantMetadata(contigLength, sampleIds, Some(vcfHeader), EmptySampleAnnotations(sampleIds.length), EmptyAnnotationSignatures(), EmptyAnnotationSignatures()) def apply(contigLength: Map[String, Int], sampleIds: Array[String], vcfHeader: Array[String], - sa: Array[AnnotationData], sas: AnnotationSignatures, vas: AnnotationSignatures): VariantMetadata = { + sa: IndexedSeq[AnnotationData], sas: AnnotationSignatures, vas: AnnotationSignatures): VariantMetadata = { new VariantMetadata(contigLength, sampleIds, Some(vcfHeader), sa, sas, vas) } } case class VariantMetadata(contigLength: Map[String, Int], - sampleIds: Array[String], - vcfHeader: Option[Array[String]], - sampleAnnotations: Array[AnnotationData], + sampleIds: IndexedSeq[String], + vcfHeader: Option[IndexedSeq[String]], + sampleAnnotations: IndexedSeq[AnnotationData], sampleAnnotationSignatures: AnnotationSignatures, variantAnnotationSignatures: AnnotationSignatures) { diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala index 1a5b152ca06..2fdb458406e 100644 --- 
a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala @@ -125,19 +125,21 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, } } - def filterVariants(p: (Variant, Annotations[String]) => Boolean) = + def filterVariants(p: (Variant, Annotations[String]) => Boolean): VariantSampleMatrix[T] = copy(rdd = rdd.filter { case (v, va, gs) => p(v, va) }) def filterVariants(ilist: IntervalList): VariantSampleMatrix[T] = filterVariants((v, va) => ilist.contains(v.contig, v.start)) - def filterSamples(p: (Int, AnnotationData) => Boolean) = { - val localZipped = localSamples.zipWith(metadata.sampleAnnotations, (i: Int, j: AnnotationData) => (i, j)) - val localZippedBc = sparkContext.broadcast(localZipped) - copy[T](localSamples = localZipped.filter(p.tupled).map(_._1), + def filterSamples(p: (Int, AnnotationData) => Boolean): VariantSampleMatrix[T] = { + val mask = localSamples.zip(metadata.sampleAnnotations).map { case (s, sa) => p(s, sa) } + val maskBc = sparkContext.broadcast(mask) + copy[T](localSamples = localSamples.zipWithIndex + .filter { case (s, i) => mask(i) } + .map(_._1), rdd = rdd.map { case (v, va, gs) => - (v, va, localZippedBc.value.view.zip(gs.view) - .filter { case ((s, sa), _) => p(s, sa) } + (v, va, maskBc.value.view.zip(gs.view) + .filter(_._1) .map(_._2)) }) } @@ -240,13 +242,14 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, rdd.map { case (v, va, gs) => (v, gs.foldLeft(zeroValue)((acc, g) => combOp(acc, g))) } def same(that: VariantSampleMatrix[T]): Boolean = { + println(metadata == that.metadata) metadata == that.metadata && localSamples.sameElements(that.localSamples) && rdd.map { case (v, va, gs) => (v, (va, gs)) } .fullOuterJoin(that.rdd.map { case (v, va, gs) => (v, (va, gs)) }) .map { case (v, t) => t match { case (Some((va1, it1)), Some((va2, it2))) => - it1.sameElements(it2) && va1.equals(va2) + it1.sameElements(it2) && 
va1 == va2 case _ => false } }.reduce(_ && _) @@ -301,7 +304,7 @@ class RichVDS(vds: VariantDataset) { // rdd.toDF().write.parquet(dirname + "/rdd.parquet") vds.rdd - .map { case (v, va, gs) => (v, gs.toGenotypeStream(v, compress)) } + .map { case (v, va, gs) => (v, va, gs.toGenotypeStream(v, compress)) } .toDF() .saveAsParquetFile(dirname + "/rdd.parquet") } diff --git a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala index d316fb3cf7f..a1a92f6a204 100644 --- a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala @@ -6,74 +6,114 @@ import org.broadinstitute.hail.driver._ import org.broadinstitute.hail.variant.{Genotype, IntervalList, Variant} import org.scalacheck.Gen import org.testng.annotations.Test -import org.broadinstitute.hail.methods.LoadVCF -import org.broadinstitute.hail.methods.FilterVariantCondition +import org.broadinstitute.hail.methods._ +import org.broadinstitute.hail.methods.FilterUtils.toConvertibleString +import scala.language.implicitConversions +/** + * This testing suite evaluates the functionality of the [[org.broadinstitute.hail.annotations]] package + */ class AnnotationsSuite extends SparkSuite { - - def getFunction(cond: String, vas: AnnotationSignatures): (Variant, AnnotationData) => Boolean = { - cond match { - case c: String => - try { - val cf = new FilterVariantCondition(c, vas) - cf.typeCheck() - cf.apply - } catch { - case e: scala.tools.reflect.ToolBoxError => - /* e.message looks like: - reflective compilation has failed: - - ';' expected but '.' found. */ - fatal("parse error in condition: " + e.message.split("\n").last) - } - } - } - @Test def test() { + /* + The below tests are designed to check for a subset of variants and info fields, that: + 1. the types, conversion strings, and description strings agree with the VCF + 2. 
the strings stored in the AnnotationData classes agree with the VCF + 3. the strings stored in the AnnotationData classes convert correctly to the proper type + */ + val vds = LoadVCF(sc, "src/test/resources/sample.vcf") val state = State("", sc, sqlContext, vds) - -// val vds = LoadVCF(sc, "src/test/resources/linearRegression.vcf") val vas = vds.metadata.variantAnnotationSignatures + val variantAnnotationMap = vds.variantsAndAnnotations.collect().toMap + val firstVariant = Variant("20", 10019093, "A", "G") + val anotherVariant = Variant("20", 10036107, "T", "G") + assert(variantAnnotationMap.contains(firstVariant)) + assert(variantAnnotationMap.contains(anotherVariant)) - //FIXME involve every type of thing we can generate, involve options... look at vcf spec 4.2 ... - val vTotal = vds.nVariants + // type Int - INFO.DP + assert(vas.get("info", "DP").contains(SimpleSignature("Int", "toInt", + "Approximate read depth; some reads may have been filtered"))) + assert(variantAnnotationMap(firstVariant) + .get("info", "DP") + .contains("77560") && + variantAnnotationMap(firstVariant) + .get("info", "DP").get.toInt == 77560) + assert(variantAnnotationMap(anotherVariant) + .get("info", "DP") + .contains("20271") && + variantAnnotationMap(anotherVariant) + .get("info", "DP").get.toInt == 20271) - val cond1 = "true" - val p1 = getFunction(cond1, vas) - assert(vds.filterVariants(p1).nVariants == vTotal) + // type Double - INFO.HWP + assert(vas.get("info", "HWP").contains(SimpleSignature("Double", "toDouble", + "P value from test of Hardy Weinberg Equilibrium"))) + assert(variantAnnotationMap(firstVariant) + .contains("info", "HWP") && + D_==(variantAnnotationMap(firstVariant) + .get("info", "HWP").get.toDouble, 0.0001)) + assert(variantAnnotationMap(anotherVariant) + .contains("info", "HWP") && + D_==(variantAnnotationMap(anotherVariant) + .get("info", "HWP").get.toDouble, 0.8286)) + // type String - INFO.culprit + assert(vas.get("info", 
"culprit").contains(SimpleSignature("String", "toString", + "The annotation which was the worst performing in the Gaussian mixture model, " + + "likely the reason why the variant was filtered out"))) + assert(variantAnnotationMap(firstVariant) + .get("info", "culprit") + .contains("FS")) + assert(variantAnnotationMap(anotherVariant) + .get("info", "culprit") + .contains("FS")) - val cond2 = "va.info.FS == 0" - val p2 = getFunction(cond2, vas) - assert(vds.filterVariants(p2).nVariants == 132) + // type Array - INFO.AC (allele count) + assert(vas.get("info", "AC").contains(SimpleSignature("Array[Int]", "toArrayInt", + "Allele count in genotypes, for each ALT allele, in the same order as listed"))) + assert(variantAnnotationMap(firstVariant) + .get("info", "AC") + .contains("89") && + variantAnnotationMap(firstVariant) + .get("info", "AC").get.toArrayInt + .sameElements(Array(89))) + assert(variantAnnotationMap(anotherVariant) + .get("info", "AC") + .contains("13") && + variantAnnotationMap(anotherVariant) + .get("info", "AC").get.toArrayInt + .sameElements(Array(13))) - val cond3 = "va.info.HWP == 1" - val p3 = getFunction(cond3, vas) - assert(vds.filterVariants(p3).nVariants == 159) + // type Boolean/flag - INFO.DB (dbSNP membership) + assert(vas.get("info", "DB").contains(SimpleSignature("Boolean", "toBoolean", + "dbSNP Membership"))) + assert(variantAnnotationMap(firstVariant) + .get("info", "DB") + .contains("true") && + variantAnnotationMap(firstVariant) + .get("info", "DB").get.toBoolean) // .get.toBoolean == true + assert(!variantAnnotationMap(anotherVariant) + .contains("info", "DB")) - val state2 = VariantQC.run(state, Array("--store")) -// state2.vds.metadata.variantAnnotationSignatures.maps.foreach{case (k,m) => - // m.foreach {case (k2,ss) => println(k2 + " " + ss.conversion)} } -// state2.vds.rdd.map { case (v,va,gs) => va } -// .collect() -// .apply(1) -// .maps("qc").foreach(println(_)) - println(FilterVariants.run(state2, Array("--keep", "-c", 
"(va.qc.MAF.isDefined && va.qc.MAF.get > 0.05)")) - .vds - .nVariants) + //type Set[String] + assert(vas.get("filters").contains(SimpleSignature("Set[String]", "toSetString", "filters applied to site"))) + assert(variantAnnotationMap(firstVariant) + .get("filters").contains("PASS") && + variantAnnotationMap(firstVariant) + .get("filters").get.toSetString == Set[String]("PASS")) + assert(variantAnnotationMap(anotherVariant) + .get("filters").contains("VQSRTrancheSNP99.95to100.00") && + variantAnnotationMap(anotherVariant) + .get("filters").get.toSetString == Set[String]("VQSRTrancheSNP99.95to100.00")) -// FilterGenotypes.run(state2, Array("--keep", "-c", "g.dp > 100")).vds -// .rdd -// .map { case (v, va, gs) => (v, va, gs.toArray) } -// .collect()(0)._3.foreach(println) - ExportVariants.run(state2, Array("--output", - "src/test/resources/sample.vcf.exportVariants", "-c", "v.contig,v.start,va.qc.rHetHomVar,va.qc.MAF,va.qc.dpMean")) - val state3 = SampleQC.run(state2, Array("--store")) - ExportSamples.run(state3, Array("--output", "src/test/resources/sample.vcf.exportSamples", "-c", "s.id,sa.qc.dpMean,sa.qc.nHet")) -// assert({val nV = .vds.nVariants; -// println(s"nV = $nV"); nV > 0}) + // GATK PASS + assert(vas.get("pass").contains(SimpleSignature("Boolean", "toBoolean", + "filters were applied to vcf and this site passed"))) + assert(variantAnnotationMap(firstVariant) + .get("pass").contains("true")) + assert(variantAnnotationMap(anotherVariant) + .get("pass").contains("false")) } } diff --git a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala new file mode 100644 index 00000000000..55e26210674 --- /dev/null +++ b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala @@ -0,0 +1,58 @@ +package org.broadinstitute.hail.methods + +import org.broadinstitute.hail.SparkSuite +import org.broadinstitute.hail.driver._ +import org.testng.annotations.Test + +import scala.io.Source + 
+/** + * This testing suite evaluates the [[org.broadinstitute.hail.driver.ExportVariants]] + * and [[org.broadinstitute.hail.driver.ExportSamples]] commands, and verifies that + * their output agrees with [[org.broadinstitute.hail.driver.VariantQC]] and + * [[org.broadinstitute.hail.driver.SampleQC]] commands. + */ +class ExportSuite extends SparkSuite{ + + @Test def test() { + val vds = LoadVCF(sc, "src/test/resources/sample.vcf") + val state = State("", sc, sqlContext, vds) + + SampleQC.run(state, Array("-o" ,"src/test/resources/sample.vcf.sampleQC")) + val postSampleQC = SampleQC.run(state, Array("--store")) + + ExportSamples.run(postSampleQC, Array("-o" ,"src/test/resources/sample.vcf.exportSamples", "-c", + "s.id, sa.qc.nCalled,sa.qc.nNotCalled,sa.qc.nHomRef,sa.qc.nHet,sa.qc.nHomVar,sa.qc.nSNP,sa.qc.nInsertion," + + "sa.qc.nDeletion,sa.qc.nSingleton,sa.qc.nTransition,sa.qc.nTransversion,sa.qc.dpMean,sa.qc.dpStDev," + + "sa.qc.dpMeanHomRef,sa.qc.dpStDevHomRef,sa.qc.dpMeanHet,sa.qc.dpStDevHet,sa.qc.dpMeanHomVar," + + "sa.qc.dpStDevHomVar,sa.qc.gqMean,sa.qc.gqStDev,sa.qc.gqMeanHomRef,sa.qc.gqStDevHomRef,sa.qc.gqMeanHet," + + "sa.qc.gqStDevHet,sa.qc.gqMeanHomVar,sa.qc.gqStDevHomVar,sa.qc.nNonRef,sa.qc.rTiTv,sa.qc.rHetHomVar," + + "sa.qc.rDeletionInsertion")) + + val sQcOutput = Source.fromFile("src/test/resources/sample.vcf.sampleQC/part-00000") + .getLines().toSet + val sExportOutput = Source.fromFile("src/test/resources/sample.vcf.exportSamples/part-00000") + .getLines().toSet + + assert(sQcOutput == sExportOutput) + + VariantQC.run(state, Array("-o" ,"src/test/resources/sample.vcf.variantQC")) + + val postVariantQC = VariantQC.run(state, Array("--store")) + + ExportVariants.run(postVariantQC, Array("-o", "src/test/resources/sample.vcf.exportVariants", "-c", + "v.contig,v.start,v.ref,v.alt,va.qc.nCalled,va.qc.nNotCalled,va.qc.nHomRef,va.qc.nHet,va.qc.nHomVar,va.qc.dpMean,va.qc.dpStDev," + + 
"va.qc.dpMeanHomRef,va.qc.dpStDevHomRef,va.qc.dpMeanHet,va.qc.dpStDevHet,va.qc.dpMeanHomVar," + + "va.qc.dpStDevHomVar,va.qc.gqMean,va.qc.gqStDev,va.qc.gqMeanHomRef,va.qc.gqStDevHomRef," + + "va.qc.gqMeanHet,va.qc.gqStDevHet,va.qc.gqMeanHomVar,va.qc.gqStDevHomVar,va.qc.MAF,va.qc.nNonRef," + + "va.qc.rHeterozygosity,va.qc.rHetHomVar,va.qc.rExpectedHetFrequency,va.qc.pHWE")) + + val vQcOutput = Source.fromFile("src/test/resources/sample.vcf.variantQC/part-00000") + .getLines().toSet + val vExportOutput = Source.fromFile("src/test/resources/sample.vcf.exportVariants/part-00000") + .getLines().toSet + + assert(vQcOutput == vExportOutput) + + } +} diff --git a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala index d76b8cd6441..4805ae4a1c0 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala @@ -1,8 +1,6 @@ package org.broadinstitute.hail.methods -import java.io.File import org.broadinstitute.hail.SparkSuite -import org.broadinstitute.hail.driver.Main._ import org.broadinstitute.hail.driver.{FilterVariants, FilterSamples, FilterGenotypes, State} import org.testng.annotations.Test @@ -22,5 +20,13 @@ class FilterSuite extends SparkSuite { assert(!highGQ.exists { case (v, s, g) => g.call.exists(c => c.gq < 20) }) assert(highGQ.count{ case (v, s, g) => g.call.exists(c => c.gq >= 20) } == 31260) + + // the below command will test typing of runtime-generated code exposing annotations + FilterGenotypes.run(state, Array("--keep", "-c", + """assert(va.pass.getClass.getName == "boolean");""" + + """assert(va.info.AN.getClass.getName == "int");""" + + """assert(va.info.GQ_MEAN.getClass.getName == "double");""" + + """assert(va.info.AC.getClass.getName == "int[]");""" + + """assert(va.filters.getClass.getName.contains("scala.collection.immutable.Set"));true""")) } } diff --git 
a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala index 18b899e5432..2490f3a96b1 100644 --- a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala @@ -8,6 +8,7 @@ import scala.util.Random import scala.language.postfixOps import org.broadinstitute.hail.methods.LoadVCF import org.testng.annotations.Test +import org.broadinstitute.hail.annotations._ class VSMSuite extends SparkSuite { @@ -16,9 +17,9 @@ class VSMSuite extends SparkSuite { val vds2 = LoadVCF(sc, "src/test/resources/sample.vcf.gz") assert(vds1.same(vds2)) - val mdata1 = VariantMetadata(Map("1" -> 10, "2" -> 10), IndexedSeq("S1", "S2", "S3")) - val mdata2 = VariantMetadata(Map("1" -> 10, "2" -> 20), IndexedSeq("S1", "S2", "S3")) - val mdata3 = VariantMetadata(Map("1" -> 10), IndexedSeq("S1", "S2")) + val mdata1 = VariantMetadata(Map("1" -> 10, "2" -> 10), Array("S1", "S2", "S3")) + val mdata2 = VariantMetadata(Map("1" -> 10, "2" -> 20), Array("S1", "S2", "S3")) + val mdata3 = VariantMetadata(Map("1" -> 10), Array("S1", "S2")) assert(mdata1 != mdata2) assert(mdata1 != mdata3) @@ -28,47 +29,61 @@ class VSMSuite extends SparkSuite { val v2 = Variant("1", 2, "T", "G") val v3 = Variant("1", 2, "T", "A") - val rdd1 = sc.parallelize(Seq(v1 -> + val va1 = Annotations(Map("info" -> Map("v1thing" -> "yes")), Map("v1otherThing" -> "yes")) + val va2 = Annotations(Map("info" -> Map("v2thing" -> "yes")), Map("v2otherThing" -> "yes")) + val va3 = Annotations(Map("info" -> Map("v3thing" -> "yes")), Map("v3otherThing" -> "yes")) + + val rdd1 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), Genotype(0, (11, 1), 12, (0, 10, 100)), - Genotype(2, (0, 13), 13, (100, 10, 0))), - v2 -> + Genotype(2, (0, 13), 13, (100, 10, 0)))), + (v2, va2, Iterable(Genotype(0, (10, 0), 10, (0, 10, 100)), Genotype(0, (11, 0), 11, (0, 10, 100)), - 
Genotype(1, (6, 6), 12, (50, 0, 50))))) + Genotype(1, (6, 6), 12, (50, 0, 50)))))) // differ in variant - val rdd2 = sc.parallelize(Seq(v1 -> + val rdd2 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), Genotype(0, (11, 1), 12, (0, 10, 100)), - Genotype(2, (0, 13), 13, (100, 10, 0))), - v3 -> + Genotype(2, (0, 13), 13, (100, 10, 0)))), + (v3, va2, Iterable(Genotype(0, (10, 0), 10, (0, 10, 100)), Genotype(0, (11, 0), 11, (0, 10, 100)), - Genotype(1, (6, 6), 12, (50, 0, 50))))) + Genotype(1, (6, 6), 12, (50, 0, 50)))))) // differ in genotype - val rdd3 = sc.parallelize(Seq(v1 -> + val rdd3 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), Genotype(1, (7, 8), 15, (100, 0, 100)), - Genotype(2, (0, 13), 13, (100, 10, 0))), - v2 -> + Genotype(2, (0, 13), 13, (100, 10, 0)))), + (v2, va2, Iterable(Genotype(0, (10, 0), 10, (0, 10, 100)), Genotype(0, (11, 0), 11, (0, 10, 100)), - Genotype(1, (6, 6), 12, (50, 0, 50))))) + Genotype(1, (6, 6), 12, (50, 0, 50)))))) // for mdata3 - val rdd4 = sc.parallelize(Seq(v1 -> + val rdd4 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), - Genotype(0, (11, 1), 12, (0, 10, 100))), - v2 -> Iterable( + Genotype(0, (11, 1), 12, (0, 10, 100)))), + (v2, va2, Iterable( Genotype(0, (10, 0), 10, (0, 10, 100)), - Genotype(0, (11, 0), 11, (0, 10, 100))))) + Genotype(0, (11, 0), 11, (0, 10, 100)))))) // differ in number of variants - val rdd5 = sc.parallelize(Seq(v1 -> + val rdd5 = sc.parallelize(Seq((v1, va1, + Iterable(Genotype(-1, (0, 2), 2, null), + Genotype(0, (11, 1), 12, (0, 10, 100)))))) + + // differ in annotations + val rdd6 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), - Genotype(0, (11, 1), 12, (0, 10, 100))))) + Genotype(0, (11, 1), 12, (0, 10, 100)), + Genotype(2, (0, 13), 13, (100, 10, 0)))), + (v2, va3, + Iterable(Genotype(0, (10, 0), 10, (0, 10, 100)), + Genotype(0, (11, 0), 11, (0, 10, 100)), + Genotype(1, (6, 6), 12, (50, 0, 50)))))) val vdss = 
Array(new VariantDataset(mdata1, rdd1), new VariantDataset(mdata1, rdd2), @@ -77,7 +92,8 @@ class VSMSuite extends SparkSuite { new VariantDataset(mdata2, rdd2), new VariantDataset(mdata2, rdd3), new VariantDataset(mdata3, rdd4), - new VariantDataset(mdata3, rdd5)) + new VariantDataset(mdata3, rdd5), + new VariantDataset(mdata1, rdd6)) for (i <- vdss.indices; j <- vdss.indices) { From 99db74296392dca7a344bb79169c8aa7001c2d65 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Thu, 10 Dec 2015 10:17:37 -0500 Subject: [PATCH 03/15] Ready for code review --- .../org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala index 884eb4a33f2..8efa00d2ba7 100644 --- a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala +++ b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala @@ -28,8 +28,12 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl val filts = { if (vc.filtersWereApplied && vc.isNotFiltered) "PASS" - else - vc.getFilters.toArray.map(_.toString).reduceRight(_ + "," + _) + else { + if (vc.getFilters.isEmpty) + "" + else + vc.getFilters.toArray.map(_.toString).reduceRight(_ + "," + _) + } } val rsid = vc.getID // println(s"nFilters=%d".format(filts.length)) From 59d8c081a72982e7a30d607354787522d6446d17 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Thu, 10 Dec 2015 15:15:38 -0500 Subject: [PATCH 04/15] Expand Exporting functionality -- allow exportation of annotation modules as one field or all fields contained --- .../hail/annotations/Annotations.scala | 27 ++++++++-- .../hail/driver/ExportGenotypes.scala | 41 ++++++++++++++++ .../hail/driver/ExportSamples.scala | 27 +++++++++- .../hail/driver/ExportVariants.scala | 27 +++++++++- .../hail/methods/ExportTSV.scala | 49 ++++++++++++++----- 
.../broadinstitute/hail/variant/Sample.scala | 4 +- .../broadinstitute/hail/variant/Variant.scala | 2 + .../hail/vcf/HtsjdkRecordReader.scala | 4 -- .../hail/methods/ExportSuite.scala | 7 ++- 9 files changed, 160 insertions(+), 28 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index 7c11391c3c3..4145202a976 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -89,7 +89,7 @@ object AnnotationUtils { def annotationToString(ar: AnyRef): String = { ar match { - case iter: Iterable[_] => iter.map(_.toString).reduceRight(_ + ", " + _) + case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).reduceRight(_ + ", " + _) case _ => ar.toString } } @@ -107,23 +107,39 @@ object AnnotationUtils { object AnnotationClassBuilder { - def signatures(sigs: AnnotationSignatures, hiddenClassName: String): String = { + def signatures(sigs: AnnotationSignatures, hiddenClassName: String, + makeToString: Boolean = false, missing: String = ""): String = { val internalClasses = sigs.maps.map { case (subclass, subMap) => s"class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable {\n" + subMap.map { case (k, sig) => -// s""" val $k: $kType = subMap.getOrElse("$k", \"false\").$kMethod\n""" + // s""" val $k: $kType = subMap.getOrElse("$k", \"false\").$kMethod\n""" val default = getDefault(sig.getType) s""" val $k: ${sig.getType} = subMap.getOrElse("$k", "$default").${sig.conversion}\n""" } - .foldRight[String]("")(_ + _) + "}\n" + .foldRight[String]("")(_ + _) + { + if (makeToString) { + val keys = subMap.keys.toArray.sorted + " def __fields: Array[String] = Array(" + { + if (keys.isEmpty) "" + else keys.map(_ + s""".formatString("$missing")""") + .reduceRight(_ + "," + _) + } + ")\n" + + """ override def toString: String = """ + + 
"""if (__fields.length == 0) "" else __fields.reduceRight(_ + ";" + _)""" + "\n" + + """ def all: String = if (__fields.length == 0) "" else __fields.reduceRight(_ + "\t" + _)""" + "\n" + } + else "" + } + + "}\n" } .foldRight[String]("")(_ + _) val hiddenClass = s"class ${hiddenClassName}Annotations" + s"(annot: org.broadinstitute.hail.annotations.AnnotationData) extends Serializable {\n" + sigs.maps.map { case (subclass, subMap) => - s""" val $subclass = new __${subclass}Annotations(annot.maps(\"$subclass\"))\n""" } + s""" val $subclass = new __${subclass}Annotations(annot.maps(\"$subclass\"))\n""" + } .foldRight[String]("")(_ + _) + sigs.vals.map { case (k, sig) => val default = getDefault(sig.getType) @@ -145,6 +161,7 @@ object AnnotationClassBuilder { val arrayRegex = """Array\[(\w+)\]""".r val optionRegex = """Option\[(\w+)\]""".r + private def getDefault(typeStr: String): String = { if (typeStr == "Int" || typeStr == "Double") "0" diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala index 3fc5d94a8ae..3e70484fdcb 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala @@ -60,6 +60,47 @@ object ExportGenotypes extends Command { val stringVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => makeString(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g)) + // FIXME add additional command parsing functionality + val variantRegex = """v\.(\w+)""".r + val sampleRegex = """s\.(\w+)""".r + val topLevelSampleAnnoRegex = """sa\.(\w+)""".r + val topLevelVariantAnnoRegex = """va\.(\w+)""".r + val samplePrintMapRegex = """sa\.(\w+)\.all""".r + val variantPrintMapRegex = """va\.(\w+)\.all""".r + val annoRegex = """\wa\.(.+)""".r + def mapColumnNames(input: String): String = { + input match { + case "v" => "Variant" + case "s" => "Sample" + case "va" => 
+ fatal("parse error in condition: cannot print 'va', choose a group or value in annotations") + case "sa" => + fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations") + case variantRegex(x) => x + case sampleRegex(x) => x + case topLevelSampleAnnoRegex(x) => + if (sas.maps.contains(x)) { + val keys = sas.maps(x).keys.toArray.sorted + if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + } + else x + case topLevelVariantAnnoRegex(x) => + if (vas.maps.contains(x)) { + val keys = vas.maps(x).keys.toArray.sorted + if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + } + else x + case samplePrintMapRegex(x) => + val keys = sas.maps(x).keys + if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + case variantPrintMapRegex(x) => + val keys = vas.maps(x).keys + if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + case annoRegex(x) => x + case _ => input + } + } + writeTextFile(output + ".header", state.hadoopConf) { s => s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) s.write("\n") diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala index 667d19c3f07..fa48b732ec8 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala @@ -52,6 +52,31 @@ object ExportSamples extends Command { } } + // FIXME add additional command parsing functionality + val sampleRegex = """s\.(\w+)""".r + val topLevelAnnoRegex = """sa\.(\w+)""".r + val printMapRegex = """sa\.(\w+)\.all""".r + val annoRegex = """sa\.(.+)""".r + def mapColumnNames(input: String): String = { + input match { + case "v" => "Sample" + case "sa" => + fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations") + case sampleRegex(x) => x + case topLevelAnnoRegex(x) => + if (sas.maps.contains(x)) { + val keys = 
sas.maps(x).keys.toArray.sorted + if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + } + else x + case printMapRegex(x) => + val keys = sas.maps(x).keys + if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + case annoRegex(x) => x + case _ => input + } + } + writeTextFile(output + ".header", state.hadoopConf) { s => s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) s.write("\n") @@ -60,7 +85,7 @@ object ExportSamples extends Command { hadoopDelete(output, state.hadoopConf, recursive = true) vds.sparkContext.parallelize(vds.sampleIds.map(Sample).zip(vds.metadata.sampleAnnotations)) - .map { case (s, sa) => makeString(s, sa)} + .map { case (s, sa) => makeString(s, sa) } .saveAsTextFile(output) state diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala index 1a03c34eba3..5cf2a1f0320 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala @@ -52,8 +52,33 @@ object ExportVariants extends Command { } } + // FIXME add additional command parsing functionality + val variantRegex = """v\.(\w+)""".r + val topLevelAnnoRegex = """va\.(\w+)""".r + val printMapRegex = """va\.(\w+)\.all""".r + val annoRegex = """va\.(.+)""".r + def mapColumnNames(input: String): String = { + input match { + case "v" => "Variant" + case "va" => + fatal("parse error in condition: cannot print 'va', choose a group or value in annotations") + case variantRegex(x) => x + case topLevelAnnoRegex(x) => + if (vas.maps.contains(x)) { + val keys = vas.maps(x).keys.toArray.sorted + if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + } + else x + case printMapRegex(x) => + val keys = vas.maps(x).keys + if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + case annoRegex(x) => x + case _ => input + } + } + writeTextFile(output + ".header", state.hadoopConf) { 
s => - s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) + s.write(cond.split(",").map(mapColumnNames).reduceRight(_ + "\t" + _)) s.write("\n") } diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index 474cb533017..890bf6b0707 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -15,10 +15,11 @@ object Formatter { } class Formatter[T](val t: T) extends AnyVal { - def flattenOptions(missingValue: String): String = t match { + def formatString(missingValue: String): String = t match { case x: Option[Any] => Formatter.writeOption(x, missingValue) - case x: Iterable[Any] => x.map(_.toString).reduceRight(_ + "," + _) case d: Double => stringFormatDouble(d) + case x: Iterable[Double] => if (x.isEmpty) "" else x.map(stringFormatDouble).reduceRight(_ + "," + _) + case x: Iterable[Any] => if (x.isEmpty) "" else x.map(_.toString).reduceRight(_ + "," + _) case _ => t.toString } } @@ -30,30 +31,53 @@ object ExportUtils { object UserExportUtils { implicit def toFormatter[T](t: T): Formatter[T] = new Formatter(t) -} + class ExportVariant(val v: Variant) extends AnyVal { + def contig = v.contig + def start = v.start + def ref = v.ref + def alt = v.alt + def variantType = v.variantType + def inParX = v.inParX + def inParY = v.inParY + def isSNP = v.isSNP + def isMNP = v.isMNP + def isInsertion = v.isInsertion + def isDeletion = v.isDeletion + def isIndel = v.isIndel + def isCopmlex = v.isComplex + def isTransition = v.isTransition + def isTransversion = v.isTransversion + def nMismatch = v.nMismatch + override def toString: String = { + s"${contig}_${start}_${ref}_$alt" + } + } + +} class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures, missingValue: String) extends Evaluator[(Variant, AnnotationData) => String]({ - "(v: 
org.broadinstitute.hail.variant.Variant, \n" + + val a = "(__v: org.broadinstitute.hail.variant.Variant, \n" + "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + "import org.broadinstitute.hail.methods.FilterUtils._\n" + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + - signatures(vas, "__va") + + "val v: org.broadinstitute.hail.methods.UserExportUtils.ExportVariant = new ExportVariant(__v)\n" + + signatures(vas, "__va", makeToString = true, missing = missingValue) + instantiate("va", "__va") + - s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String"""}) { + s"""Array($list).map(_.formatString("$missingValue")).reduceRight(_ + "\t" + _)}: String""";println(a); a}) { def apply(v: Variant, va: AnnotationData): String = eval()(v, va) } class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures, missingValue: String) extends Evaluator[(Sample, AnnotationData) => String]({ - "(s: org.broadinstitute.hail.variant.Sample, \n" + + val a = "(s: org.broadinstitute.hail.variant.Sample, \n" + "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + "import org.broadinstitute.hail.methods.FilterUtils._\n" + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + - signatures(sas, "__sa") + + signatures(sas, "__sa", makeToString = true, missing = missingValue) + instantiate("sa", "__sa") + - s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String"""}) { + s"""Array($list).map(_.formatString("$missingValue")).reduceRight(_ + "\t" + _)}: String"""; println(a); a}) { def apply(s: Sample, sa: AnnotationData): String = eval()(s, sa) } @@ -65,15 +89,16 @@ class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: Anno "import org.broadinstitute.hail.methods.UserExportUtils._\n" + signatures(sas, "__sa") + makeIndexedSeq("__saArray", "__sa", "__sa") + - "(v: org.broadinstitute.hail.variant.Variant, " + + "(__v: 
org.broadinstitute.hail.variant.Variant, " + "__va: org.broadinstitute.hail.annotations.AnnotationData) => {\n" + - signatures(vas, "__va") + + "val v: org.broadinstitute.hail.methods.UserExportUtils.ExportVariant = new ExportVariant(__v)\n" + + signatures(vas, "__va", makeToString = true, missing = missingValue) + instantiate("va", "__va") + "(__sIndex: Int, " + "s: org.broadinstitute.hail.variant.Sample, " + "g: org.broadinstitute.hail.variant.Genotype) => {\n" + "val sa = __saArray(__sIndex)\n" + - s"""Array($list).map(_.flattenOptions("$missingValue")).reduceRight(_ + "\t" + _)}: String}}"""}, t => t(sad)) { + s"""Array($list).map(_.formatString("$missingValue")).reduceRight(_ + "\t" + _)}: String}}"""}, t => t(sad)) { def apply(sa: IndexedSeq[AnnotationData]) (v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): String = eval()(v, va)(sIndex, s, g) diff --git a/src/main/scala/org/broadinstitute/hail/variant/Sample.scala b/src/main/scala/org/broadinstitute/hail/variant/Sample.scala index 556559096ea..be774e813c5 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/Sample.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/Sample.scala @@ -1,3 +1,5 @@ package org.broadinstitute.hail.variant -case class Sample(id: String) +case class Sample(id: String) { + override def toString: String = id +} diff --git a/src/main/scala/org/broadinstitute/hail/variant/Variant.scala b/src/main/scala/org/broadinstitute/hail/variant/Variant.scala index 69b1f8c7594..30714f45385 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/Variant.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/Variant.scala @@ -35,6 +35,8 @@ case class Variant(contig: String, // Boundaries for build GRCh37: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/ def inParX(pos: Int): Boolean = (60001 <= pos && pos <= 2699520) || (154931044 <= pos && pos <= 155260560) def inParY(pos: Int): Boolean = (10001 <= pos && pos <= 2649520) || ( 59034050 <= pos 
&& pos <= 59363566) + def inParX: Boolean = inParX(start) + def inParY: Boolean = inParY(start) def isHemizygous(sex: Sex.Sex): Boolean = (sex == Sex.Male) && (contig == "X" && !inParX(start)) || (contig == "Y" && !inParY(start)) diff --git a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala index 13cccfe8ce5..02451396d59 100644 --- a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala +++ b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala @@ -36,10 +36,6 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl } } val rsid = vc.getID -// println(s"nFilters=%d".format(filts.length)) -// println("qual=%.2f".format(vc.getPhredScaledQual)) -// println("Filters are: ") -// filts.foreach(println(_)) if (vc.isBiallelic) { val variant = Variant(vc.getContig, vc.getStart, vc.getReference.getBaseString, vc.getAlternateAllele(0).getBaseString) diff --git a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala index 55e26210674..6b7f3cf4cc5 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala @@ -18,10 +18,10 @@ class ExportSuite extends SparkSuite{ val vds = LoadVCF(sc, "src/test/resources/sample.vcf") val state = State("", sc, sqlContext, vds) - SampleQC.run(state, Array("-o" ,"src/test/resources/sample.vcf.sampleQC")) + SampleQC.run(state, Array("-o", "src/test/resources/sample.vcf.sampleQC")) val postSampleQC = SampleQC.run(state, Array("--store")) - ExportSamples.run(postSampleQC, Array("-o" ,"src/test/resources/sample.vcf.exportSamples", "-c", + ExportSamples.run(postSampleQC, Array("-o", "src/test/resources/sample.vcf.exportSamples", "-c", "s.id, sa.qc.nCalled,sa.qc.nNotCalled,sa.qc.nHomRef,sa.qc.nHet,sa.qc.nHomVar,sa.qc.nSNP,sa.qc.nInsertion," + 
"sa.qc.nDeletion,sa.qc.nSingleton,sa.qc.nTransition,sa.qc.nTransversion,sa.qc.dpMean,sa.qc.dpStDev," + "sa.qc.dpMeanHomRef,sa.qc.dpStDevHomRef,sa.qc.dpMeanHet,sa.qc.dpStDevHet,sa.qc.dpMeanHomVar," + @@ -36,7 +36,7 @@ class ExportSuite extends SparkSuite{ assert(sQcOutput == sExportOutput) - VariantQC.run(state, Array("-o" ,"src/test/resources/sample.vcf.variantQC")) + VariantQC.run(state, Array("-o", "src/test/resources/sample.vcf.variantQC")) val postVariantQC = VariantQC.run(state, Array("--store")) @@ -53,6 +53,5 @@ class ExportSuite extends SparkSuite{ .getLines().toSet assert(vQcOutput == vExportOutput) - } } From ec09ddf3802d75981152d4123acab9d7481c84fd Mon Sep 17 00:00:00 2001 From: tpoterba Date: Thu, 10 Dec 2015 15:17:28 -0500 Subject: [PATCH 05/15] Expand Exporting functionality -- allow exportation of annotation modules as one field or all fields contained --- .../scala/org/broadinstitute/hail/annotations/Annotations.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index 4145202a976..94aa0743523 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -174,4 +174,4 @@ object AnnotationClassBuilder { case _ => "" } } -} \ No newline at end of file +} From d58ba419e852325e113ee824337f6e104d39d413 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Sat, 12 Dec 2015 15:52:44 -0500 Subject: [PATCH 06/15] Fixes from first round of cseed comments are implemented --- .../scala/org/broadinstitute/hail/Utils.scala | 6 + .../hail/annotations/Annotations.scala | 112 +++++------------- .../hail/driver/ExportGenotypes.scala | 22 ++-- .../hail/driver/ExportSamples.scala | 6 +- .../hail/driver/ExportVariants.scala | 6 +- .../hail/driver/FilterGenotypes.scala | 14 +-- .../hail/driver/FilterSamples.scala | 19 +-- 
.../hail/driver/FilterVariants.scala | 15 +-- .../org/broadinstitute/hail/driver/Main.scala | 4 +- .../broadinstitute/hail/driver/SampleQC.scala | 12 +- .../hail/methods/ExportTSV.scala | 46 +++---- .../broadinstitute/hail/methods/Filter.scala | 13 +- .../broadinstitute/hail/methods/LoadVCF.scala | 4 +- .../hail/variant/VariantMetadata.scala | 11 +- .../hail/vcf/HtsjdkRecordReader.scala | 2 +- .../hail/methods/ExportSuite.scala | 16 +-- 16 files changed, 122 insertions(+), 186 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/Utils.scala b/src/main/scala/org/broadinstitute/hail/Utils.scala index 9a686df2139..e4b2fdcfa64 100644 --- a/src/main/scala/org/broadinstitute/hail/Utils.scala +++ b/src/main/scala/org/broadinstitute/hail/Utils.scala @@ -432,6 +432,12 @@ object Utils { d.formatted("%.4e") } + def writeOption(o: Option[Any], missingValue: String = "NA"): String = o match { + case Some(d: Double) => stringFormatDouble(d) + case Some(x) => x.toString + case None => missingValue + } + // FIXME Would be nice to have a version that averages three runs, perhaps even discarding an initial run. In this case the code block had better be functional! 
def printTime[T](block: => T) = { val timed = time(block) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index 94aa0743523..7eb95479018 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -2,107 +2,56 @@ package org.broadinstitute.hail.annotations case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T]) extends Serializable { - def nAttrs: Int = { - var i = 0 - maps.foreach { - case (id, m) => - i += m.size - } - i += vals.size - i - } + def nAttrs: Int = maps.map(_._2.size).sum + vals.size def hasMap(str: String): Boolean = maps.contains(str) - def contains(str: String): Boolean = vals.contains(str.toLowerCase) + def contains(str: String): Boolean = vals.contains(str) def contains(parent: String, str: String): Boolean = hasMap(parent) && maps(parent).contains(str) def get(str: String): Option[T] = vals.get(str) - def get(parent: String, str: String): Option[T] = { - if (!hasMap(parent)) - None - else - maps(parent).get(str) - } + def get(parent: String, str: String): Option[T] = + maps.get(parent).flatMap(_.get(str)) - def getOrElse(parent: String, str: String, default: T): T = { - if (!hasMap(parent) || !contains(parent, str)) - default - else - maps(parent)(str) - } + def getMap(parent: String): Option[Map[String, T]] = maps.get(parent) - def getOrElse(str: String, default: T): T = { - if (!contains(str)) - default - else - vals(str) - } + def addMap(name: String, m: Map[String, T]): Annotations[T] = + Annotations(maps + ((name, m)), vals) - def addMap(name: String, m: Map[String, T]): Annotations[T] = { - Annotations(maps - .-(name) - .+((name, m)), vals) - } + def addMaps(newMaps: Map[String, Map[String, T]]): Annotations[T] = + Annotations(maps ++ newMaps, vals) - def addMaps(newMaps: Map[String, Map[String, T]]): 
Annotations[T] = { - Annotations(maps - .--(newMaps.keys) - .++(newMaps), vals) - } + def addVal(name: String, mapping: T): Annotations[T] = Annotations(maps, vals + ((name, mapping))) - def addVal(name: String, mapping: T): Annotations[T] = { - Annotations(maps, vals - .-(name) - .+((name, mapping))) - } + def addVals(newVals: Map[String, T]): Annotations[T] = Annotations(maps, vals ++ newVals) - def addVals(newVals: Map[String, T]): Annotations[T] = { - Annotations(maps, vals - .--(newVals.keys) - .++(newVals)) + def ++ (other: Annotations[T]): Annotations[T] = { + new Annotations(maps ++ other.maps, vals ++ other.vals) } } -object EmptyAnnotationSignatures { - def apply(): AnnotationSignatures = { +object Annotations { + def emptyOfSignature(): AnnotationSignatures = Annotations(Map.empty[String, Map[String, AnnotationSignature]], Map.empty[String, AnnotationSignature]) - } -} -object EmptyAnnotations { - def apply(): AnnotationData = { + def emptyOfString(): AnnotationData = Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String]) - } -} -object EmptySampleAnnotations { - def apply(nSamples: Int): IndexedSeq[AnnotationData] = { + def emptyOfArrayString(nSamples: Int): IndexedSeq[AnnotationData] = (0 until nSamples) .map(i => Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String])) - } } object AnnotationUtils { def annotationToString(ar: AnyRef): String = { ar match { - case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).reduceRight(_ + ", " + _) + case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).mkString(", ") case _ => ar.toString } } - - def parseAnnotationType(str: String): String = { - str match { - case "Flag" => "Boolean" - case "Integer" => "Int" - case "Float" => "Double" - case "String" => "String" - case _ => throw new UnsupportedOperationException("unexpected annotation type") - } - } } object AnnotationClassBuilder { @@ -113,7 +62,7 @@ object 
AnnotationClassBuilder { case (subclass, subMap) => s"class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable {\n" + subMap.map { case (k, sig) => - // s""" val $k: $kType = subMap.getOrElse("$k", \"false\").$kMethod\n""" + // s""" val $k: $kType = subMap.getFromMapOrElse("$k", \"false\").$kMethod\n""" val default = getDefault(sig.getType) s""" val $k: ${sig.getType} = subMap.getOrElse("$k", "$default").${sig.conversion}\n""" } @@ -122,12 +71,14 @@ object AnnotationClassBuilder { val keys = subMap.keys.toArray.sorted " def __fields: Array[String] = Array(" + { if (keys.isEmpty) "" - else keys.map(_ + s""".formatString("$missing")""") - .reduceRight(_ + "," + _) + else keys.map(s => s"""formatString($s, "$missing")""") + .mkString(",") } + ")\n" + - """ override def toString: String = """ + - """if (__fields.length == 0) "" else __fields.reduceRight(_ + ";" + _)""" + "\n" + - """ def all: String = if (__fields.length == 0) "" else __fields.reduceRight(_ + "\t" + _)""" + "\n" + """ override def toString: String = + | if (__fields.length == 0) "" else __fields.mkString(";") + | def all: String = if (__fields.length == 0) "" else __fields.mkString("\t") + | + """.stripMargin } else "" } + @@ -154,10 +105,11 @@ object AnnotationClassBuilder { s"val $exposedName = new ${hiddenClassName}Annotations($hiddenClassName)\n" } - def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = { - s"val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] = " + - s"$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_))\n" - } + def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = + s"""val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] + |$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_)) + | + """.stripMargin val arrayRegex = """Array\[(\w+)\]""".r val optionRegex = """Option\[(\w+)\]""".r 
@@ -172,6 +124,6 @@ object AnnotationClassBuilder { case optionRegex(subType) => "None" case arrayRegex(subType) => getDefault(subType) case _ => "" - } + } } } diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala index 3e70484fdcb..294d82f2a42 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala @@ -36,15 +36,15 @@ object ExportGenotypes extends Command { val output = options.output - val vas: AnnotationSignatures = state.vds.metadata.variantAnnotationSignatures - val sas: AnnotationSignatures = state.vds.metadata.sampleAnnotationSignatures + val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures + val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures val sa = state.vds.metadata.sampleAnnotations - val makeString: IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => + val makeString: ((Variant, AnnotationData) => ((Int, Sample, Genotype) => String)) = try { val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, options.missing) cf.typeCheck() - cf.apply + cf.apply(sa) } catch { case e: scala.tools.reflect.ToolBoxError => @@ -55,10 +55,10 @@ object ExportGenotypes extends Command { fatal("parse error in condition: " + e.message.split("\n").last) } - val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) + val sampleIdsBc = state.sc.broadcast(vds.sampleIds) val stringVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => - makeString(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g)) + makeString(v, va)(s, Sample(sampleIdsBc.value(s)), g)) // FIXME add additional command parsing functionality val variantRegex = """v\.(\w+)""".r @@ -81,28 +81,28 @@ object ExportGenotypes extends Command { case topLevelSampleAnnoRegex(x) => if (sas.maps.contains(x)) { val keys = 
sas.maps(x).keys.toArray.sorted - if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + if (keys.isEmpty) x else s"$x:" + keys.mkString(";") } else x case topLevelVariantAnnoRegex(x) => if (vas.maps.contains(x)) { val keys = vas.maps(x).keys.toArray.sorted - if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + if (keys.isEmpty) x else s"$x:" + keys.mkString(";") } else x case samplePrintMapRegex(x) => val keys = sas.maps(x).keys - if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + if (keys.isEmpty) x else keys.mkString("\t") case variantPrintMapRegex(x) => val keys = vas.maps(x).keys - if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + if (keys.isEmpty) x else keys.mkString("\t") case annoRegex(x) => x case _ => input } } writeTextFile(output + ".header", state.hadoopConf) { s => - s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) + s.write(cond.split(",").map(_.split("\\.").last).mkString("\t")) s.write("\n") } diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala index fa48b732ec8..ad2e67ec83f 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala @@ -66,19 +66,19 @@ object ExportSamples extends Command { case topLevelAnnoRegex(x) => if (sas.maps.contains(x)) { val keys = sas.maps(x).keys.toArray.sorted - if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + if (keys.isEmpty) x else s"$x:" + keys.mkString(";") } else x case printMapRegex(x) => val keys = sas.maps(x).keys - if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + if (keys.isEmpty) x else keys.mkString("\t") case annoRegex(x) => x case _ => input } } writeTextFile(output + ".header", state.hadoopConf) { s => - s.write(cond.split(",").map(_.split("\\.").last).reduceRight(_ + "\t" + _)) + 
s.write(cond.split(",").map(_.split("\\.").last).mkString("\t")) s.write("\n") } diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala index 5cf2a1f0320..204fb32bd59 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala @@ -66,19 +66,19 @@ object ExportVariants extends Command { case topLevelAnnoRegex(x) => if (vas.maps.contains(x)) { val keys = vas.maps(x).keys.toArray.sorted - if (keys.isEmpty) x else s"$x:" + keys.reduceRight(_ + ";" + _) + if (keys.isEmpty) x else s"$x:" + keys.mkString(";") } else x case printMapRegex(x) => val keys = vas.maps(x).keys - if (keys.isEmpty) x else keys.reduceRight(_ + "\t" + _) + if (keys.isEmpty) x else keys.mkString("\t") case annoRegex(x) => x case _ => input } } writeTextFile(output + ".header", state.hadoopConf) { s => - s.write(cond.split(",").map(mapColumnNames).reduceRight(_ + "\t" + _)) + s.write(cond.split(",").map(mapColumnNames).mkString("\t")) s.write("\n") } diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala index f2565a81ef6..12d779b3da7 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala @@ -35,18 +35,10 @@ object FilterGenotypes extends Command { if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") - val p: IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean)) = try { + val p: ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean)) = try { val cf = new FilterGenotypeCondition(options.condition, vas, sas, sa) cf.typeCheck() - cf.apply - } - catch { - case e: scala.tools.reflect.ToolBoxError => - /* e.message looks like: - 
reflective compilation has failed: - - ';' expected but '.' found. */ - fatal("parse error in condition: " + e.message.split("\n").last) + cf.apply(sa) } val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) @@ -54,7 +46,7 @@ object FilterGenotypes extends Command { val localRemove = options.remove //FIXME put keep/remove logic here val newVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => - if (p(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g)) { + if (p(v, va)(s, Sample(sampleIdsBc.value(s)), g)) { if (localKeep) g else Genotype(-1, (0, 0), 0, null) } else { diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala index b64f1098ed9..3ea5feefc82 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala @@ -46,20 +46,11 @@ object FilterSamples extends Command { .toSet (s: Int, sa: AnnotationData) => samples.contains(s) case c: String => - try { - val cf = new FilterSampleCondition(c, sas) - cf.typeCheck() - - val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) - (s: Int, sa: AnnotationData) => cf(Sample(sampleIdsBc.value(s)), state.vds.metadata.sampleAnnotations(s)) - } catch { - case e: scala.tools.reflect.ToolBoxError => - /* e.message looks like: - reflective compilation has failed: - - ';' expected but '.' found. 
*/ - fatal("parse error in condition: " + e.message.split("\n").last) - } + val cf = new FilterSampleCondition(c, sas) + cf.typeCheck() + + val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) + (s: Int, sa: AnnotationData) => cf(Sample(sampleIdsBc.value(s)), state.vds.metadata.sampleAnnotations(s)) } val newVDS = vds.filterSamples(if (options.keep) diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala index 407032d583b..34eb616d1f3 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterVariants.scala @@ -39,18 +39,9 @@ object FilterVariants extends Command { val ilist = IntervalList.read(options.condition) (v: Variant, va: Annotations[String]) => ilist.contains(v.contig, v.start) case c: String => - try { - val cf = new FilterVariantCondition(c, vas) - cf.typeCheck() - cf.apply - } catch { - case e: scala.tools.reflect.ToolBoxError => - /* e.message looks like: - reflective compilation has failed: - - ';' expected but '.' found. 
*/ - fatal("parse error in condition: " + e.message.split("\n").last) - } + val cf = new FilterVariantCondition(c, vas) + cf.typeCheck() + cf.apply } val newVDS = vds.filterVariants(if (options.keep) diff --git a/src/main/scala/org/broadinstitute/hail/driver/Main.scala b/src/main/scala/org/broadinstitute/hail/driver/Main.scala index 92b38a5015f..5613abfd9a1 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/Main.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/Main.scala @@ -73,7 +73,9 @@ object Main { Repartition, SampleQC, VariantQC, - Write + Write, + ExportVariants, + ExportSamples ) val nameCommand = commands diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index edfda8cbc23..6f56517b118 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -340,9 +340,10 @@ object SampleQC extends Command { val output = options.output + val singletons = sSingletonVariants(vds) + val sampleIdsBc = state.sc.broadcast(vds.sampleIds) + if (options.store) { - val singletons = sSingletonVariants(vds) - val sampleIdsBc = state.sc.broadcast(vds.sampleIds) val r = results(vds).collectAsMap() val newAnnotations = vds.metadata.sampleAnnotations .zipWithIndex @@ -353,18 +354,13 @@ object SampleQC extends Command { sampleAnnotationSignatures = vds.metadata.sampleAnnotationSignatures .addMap("qc", SampleQCCombiner.signatures), sampleAnnotations = newAnnotations))) - } - else { - + } else { writeTextFile(output + ".header", state.hadoopConf) { s => s.write("sampleID\t") s.write(SampleQCCombiner.header) s.write("\n") } - val singletons = sSingletonVariants(vds) - val sampleIdsBc = state.sc.broadcast(vds.sampleIds) - hadoopDelete(output, state.hadoopConf, recursive = true) val r = results(vds) .map { case (s, comb) => diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala 
b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index 890bf6b0707..a79e5e1d0f0 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -3,34 +3,21 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.variant.{Sample, Variant, Genotype} -import org.broadinstitute.hail.Utils.stringFormatDouble +import org.broadinstitute.hail.Utils._ import scala.language.implicitConversions -object Formatter { - def writeOption(o: Option[Any], missingValue: String): String = o match { - case Some(d: Double) => stringFormatDouble(d) - case Some(x) => x.toString - case None => missingValue - } -} - -class Formatter[T](val t: T) extends AnyVal { - def formatString(missingValue: String): String = t match { - case x: Option[Any] => Formatter.writeOption(x, missingValue) - case d: Double => stringFormatDouble(d) - case x: Iterable[Double] => if (x.isEmpty) "" else x.map(stringFormatDouble).reduceRight(_ + "," + _) - case x: Iterable[Any] => if (x.isEmpty) "" else x.map(_.toString).reduceRight(_ + "," + _) - case _ => t.toString - } -} - object ExportUtils { type ExportGenotypeWithSA = (IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => String))) type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Sample, Genotype) => String) } object UserExportUtils { - implicit def toFormatter[T](t: T): Formatter[T] = new Formatter(t) + def formatString(a: Any, missingValue: String): String = a match { + case o: Option[Any] => writeOption(o, missingValue) + case d: Double => stringFormatDouble(d) + case i: Iterable[Any] => if (i.isEmpty) "" else i.map(formatString(_, missingValue)).mkString(",") + case _ => a.toString + } class ExportVariant(val v: Variant) extends AnyVal { def contig = v.contig @@ -58,26 +45,26 @@ 
object UserExportUtils { class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures, missingValue: String) extends Evaluator[(Variant, AnnotationData) => String]({ - val a = "(__v: org.broadinstitute.hail.variant.Variant, \n" + + "(__v: org.broadinstitute.hail.variant.Variant, \n" + "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + "import org.broadinstitute.hail.methods.FilterUtils._\n" + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + "val v: org.broadinstitute.hail.methods.UserExportUtils.ExportVariant = new ExportVariant(__v)\n" + signatures(vas, "__va", makeToString = true, missing = missingValue) + instantiate("va", "__va") + - s"""Array($list).map(_.formatString("$missingValue")).reduceRight(_ + "\t" + _)}: String""";println(a); a}) { + s"""Array($list).map(formatString(_, "$missingValue")).mkString("\t")}: String"""}) { def apply(v: Variant, va: AnnotationData): String = eval()(v, va) } class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures, missingValue: String) extends Evaluator[(Sample, AnnotationData) => String]({ - val a = "(s: org.broadinstitute.hail.variant.Sample, \n" + + "(s: org.broadinstitute.hail.variant.Sample, \n" + "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + "import org.broadinstitute.hail.methods.FilterUtils._\n" + "import org.broadinstitute.hail.methods.UserExportUtils._\n" + signatures(sas, "__sa", makeToString = true, missing = missingValue) + instantiate("sa", "__sa") + - s"""Array($list).map(_.formatString("$missingValue")).reduceRight(_ + "\t" + _)}: String"""; println(a); a}) { + s"""Array($list).map(formatString(_, "$missingValue")).mkString("\t")}: String"""}) { def apply(s: Sample, sa: AnnotationData): String = eval()(s, sa) } @@ -94,11 +81,12 @@ class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: Anno "val v: org.broadinstitute.hail.methods.UserExportUtils.ExportVariant = new ExportVariant(__v)\n" + signatures(vas, "__va", 
makeToString = true, missing = missingValue) + instantiate("va", "__va") + - "(__sIndex: Int, " + - "s: org.broadinstitute.hail.variant.Sample, " + - "g: org.broadinstitute.hail.variant.Genotype) => {\n" + - "val sa = __saArray(__sIndex)\n" + - s"""Array($list).map(_.formatString("$missingValue")).reduceRight(_ + "\t" + _)}: String}}"""}, t => t(sad)) { + """(__sIndex: Int, + | s: org.broadinstitute.hail.variant.Sample, + | g: org.broadinstitute.hail.variant.Genotype) => { + | val sa = __saArray(__sIndex) + | Array($list).map(formatString(_, "$missingValue")).mkString("\t")}: String}} + """.stripMargin}, t => t(sad)) { def apply(sa: IndexedSeq[AnnotationData]) (v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): String = eval()(v, va)(sIndex, s, g) diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index dc0fc91a850..0b5f34a4f7d 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -2,6 +2,7 @@ package org.broadinstitute.hail.methods import org.apache.spark.SparkContext import org.broadinstitute.hail.Utils +import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.methods.FilterUtils.{FilterGenotypePostSA, FilterGenotypeWithSA} @@ -69,7 +70,17 @@ class Evaluator[T](t: String)(implicit tct: ClassTag[T]) def typeCheck() { require(p.isEmpty) - p = Some(Utils.eval[T](t)) + try { + p = Some(Utils.eval[T](t)) + } + catch { + case e: scala.tools.reflect.ToolBoxError => + /* e.message looks like: + reflective compilation has failed: + + ';' expected but '.' found. 
*/ + fatal("parse error in condition: " + e.message.split("\n").last) + } } def eval(): T = p match { diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index f5f79a75032..8b1cee3b2a9 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -106,8 +106,8 @@ object LoadVCF { .split("\t") .drop(9) - val sampleAnnotations = EmptySampleAnnotations(sampleIds.length) - val sampleAnnotationSignatures = EmptyAnnotationSignatures() + val sampleAnnotations = Annotations.emptyOfArrayString(sampleIds.length) + val sampleAnnotationSignatures = Annotations.emptyOfSignature() val headerLinesBc = sc.broadcast(headerLines) val genotypes = sc.textFile(file, nPartitions.getOrElse(sc.defaultMinPartitions)) diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala index 72d6b625a8f..ed78b6c3496 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala @@ -5,12 +5,12 @@ import org.broadinstitute.hail.annotations._ object VariantMetadata { def apply(contigLength: Map[String, Int], sampleIds: Array[String]): VariantMetadata = new VariantMetadata(contigLength, sampleIds, None, - EmptySampleAnnotations(sampleIds.length), EmptyAnnotationSignatures(), EmptyAnnotationSignatures()) + Annotations.emptyOfArrayString(sampleIds.length), Annotations.emptyOfSignature(), Annotations.emptyOfSignature()) def apply(contigLength: Map[String, Int], sampleIds: Array[String], vcfHeader: Array[String]): VariantMetadata = new VariantMetadata(contigLength, sampleIds, Some(vcfHeader), - EmptySampleAnnotations(sampleIds.length), EmptyAnnotationSignatures(), EmptyAnnotationSignatures()) + Annotations.emptyOfArrayString(sampleIds.length), 
Annotations.emptyOfSignature(), Annotations.emptyOfSignature()) def apply(contigLength: Map[String, Int], sampleIds: Array[String], vcfHeader: Array[String], sa: IndexedSeq[AnnotationData], sas: AnnotationSignatures, vas: AnnotationSignatures): VariantMetadata = { @@ -28,4 +28,11 @@ case class VariantMetadata(contigLength: Map[String, Int], def nContigs: Int = contigLength.size def nSamples: Int = sampleIds.length + + def addSampleAnnotations(sas: AnnotationSignatures, sa: IndexedSeq[AnnotationData]): VariantMetadata = { + this.copy( + sampleAnnotationSignatures = this.sampleAnnotationSignatures ++ sas, + sampleAnnotations = this.sampleAnnotations.zip(sa).map { case (a1, a2) => a1 ++ a2 } + ) + } } diff --git a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala index 02451396d59..5b223cadd6a 100644 --- a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala +++ b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala @@ -32,7 +32,7 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl if (vc.getFilters.isEmpty) "" else - vc.getFilters.toArray.map(_.toString).reduceRight(_ + "," + _) + vc.getFilters.toArray.map(_.toString).mkString(",") } } val rsid = vc.getID diff --git a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala index 6b7f3cf4cc5..575fc6a6b48 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala @@ -18,10 +18,10 @@ class ExportSuite extends SparkSuite{ val vds = LoadVCF(sc, "src/test/resources/sample.vcf") val state = State("", sc, sqlContext, vds) - SampleQC.run(state, Array("-o", "src/test/resources/sample.vcf.sampleQC")) + SampleQC.run(state, Array("-o", "/tmp/sampleQC")) val postSampleQC = SampleQC.run(state, Array("--store")) - 
ExportSamples.run(postSampleQC, Array("-o", "src/test/resources/sample.vcf.exportSamples", "-c", + ExportSamples.run(postSampleQC, Array("-o", "/tmp/exportSamples", "-c", "s.id, sa.qc.nCalled,sa.qc.nNotCalled,sa.qc.nHomRef,sa.qc.nHet,sa.qc.nHomVar,sa.qc.nSNP,sa.qc.nInsertion," + "sa.qc.nDeletion,sa.qc.nSingleton,sa.qc.nTransition,sa.qc.nTransversion,sa.qc.dpMean,sa.qc.dpStDev," + "sa.qc.dpMeanHomRef,sa.qc.dpStDevHomRef,sa.qc.dpMeanHet,sa.qc.dpStDevHet,sa.qc.dpMeanHomVar," + @@ -29,27 +29,27 @@ class ExportSuite extends SparkSuite{ "sa.qc.gqStDevHet,sa.qc.gqMeanHomVar,sa.qc.gqStDevHomVar,sa.qc.nNonRef,sa.qc.rTiTv,sa.qc.rHetHomVar," + "sa.qc.rDeletionInsertion")) - val sQcOutput = Source.fromFile("src/test/resources/sample.vcf.sampleQC/part-00000") + val sQcOutput = Source.fromFile("/tmp/sampleQC/part-00000") .getLines().toSet - val sExportOutput = Source.fromFile("src/test/resources/sample.vcf.exportSamples/part-00000") + val sExportOutput = Source.fromFile("/tmp/exportSamples/part-00000") .getLines().toSet assert(sQcOutput == sExportOutput) - VariantQC.run(state, Array("-o", "src/test/resources/sample.vcf.variantQC")) + VariantQC.run(state, Array("-o", "/tmp/variantQC")) val postVariantQC = VariantQC.run(state, Array("--store")) - ExportVariants.run(postVariantQC, Array("-o", "src/test/resources/sample.vcf.exportVariants", "-c", + ExportVariants.run(postVariantQC, Array("-o", "/tmp/exportVariants", "-c", "v.contig,v.start,v.ref,v.alt,va.qc.nCalled,va.qc.nNotCalled,va.qc.nHomRef,va.qc.nHet,va.qc.nHomVar,va.qc.dpMean,va.qc.dpStDev," + "va.qc.dpMeanHomRef,va.qc.dpStDevHomRef,va.qc.dpMeanHet,va.qc.dpStDevHet,va.qc.dpMeanHomVar," + "va.qc.dpStDevHomVar,va.qc.gqMean,va.qc.gqStDev,va.qc.gqMeanHomRef,va.qc.gqStDevHomRef," + "va.qc.gqMeanHet,va.qc.gqStDevHet,va.qc.gqMeanHomVar,va.qc.gqStDevHomVar,va.qc.MAF,va.qc.nNonRef," + "va.qc.rHeterozygosity,va.qc.rHetHomVar,va.qc.rExpectedHetFrequency,va.qc.pHWE")) - val vQcOutput = 
Source.fromFile("src/test/resources/sample.vcf.variantQC/part-00000") + val vQcOutput = Source.fromFile("/tmp/variantQC/part-00000") .getLines().toSet - val vExportOutput = Source.fromFile("src/test/resources/sample.vcf.exportVariants/part-00000") + val vExportOutput = Source.fromFile("/tmp/exportVariants/part-00000") .getLines().toSet assert(vQcOutput == vExportOutput) From a0c46fe211e4b8ba517c8b9bccb34bb073512406 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Sat, 12 Dec 2015 15:59:54 -0500 Subject: [PATCH 07/15] Fixes from first round of cseed comments are implemented --- .../scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala index 85f44e7c5f5..b202236a188 100644 --- a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala +++ b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala @@ -119,7 +119,7 @@ object TestRDDBuilder { b += Genotype(gt, ad, dp, pl) } - (variant, EmptyAnnotations(), b.result(): Iterable[Genotype]) + (variant, Annotations.emptyOfString(), b.result(): Iterable[Genotype]) } VariantSampleMatrix(VariantMetadata(Map("1" -> 1000000), sampleList), streamRDD) } From 1b0604417afe9ccf34c5b47efc82bdeb7419d890 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Sat, 12 Dec 2015 21:10:23 -0500 Subject: [PATCH 08/15] Fixes from first round of cseed comments are implemented --- .../hail/annotations/Annotations.scala | 2 +- .../hail/driver/ExportGenotypes.scala | 32 +++---- .../hail/driver/FilterGenotypes.scala | 31 +++---- .../hail/methods/ExportTSV.scala | 88 ++++++++++-------- .../broadinstitute/hail/methods/Filter.scala | 89 +++++++++++-------- .../hail/variant/VariantSampleMatrix.scala | 10 +++ 6 files changed, 141 insertions(+), 111 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala 
b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index 7eb95479018..b735a2946fe 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -106,7 +106,7 @@ object AnnotationClassBuilder { } def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = - s"""val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] + s"""val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] = |$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_)) | """.stripMargin diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala index 294d82f2a42..33e31cfa121 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala @@ -31,37 +31,29 @@ object ExportGenotypes extends Command { def run(state: State, options: Options): State = { val vds = state.vds - val cond = options.condition - val output = options.output val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures - val sa = state.vds.metadata.sampleAnnotations + val sa = vds.metadata.sampleAnnotations + val ids = vds.sampleIds val makeString: ((Variant, AnnotationData) => - ((Int, Sample, Genotype) => String)) = try { - val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, options.missing) + ((Int, Genotype) => String)) = { + val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, ids, options.missing) cf.typeCheck() - cf.apply(sa) + cf.apply } - catch { - case e: scala.tools.reflect.ToolBoxError => - /* e.message looks like: - reflective compilation has failed: - - ';' expected but '.' found. 
*/ - fatal("parse error in condition: " + e.message.split("\n").last) - } - - val sampleIdsBc = state.sc.broadcast(vds.sampleIds) - val stringVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => - makeString(v, va)(s, Sample(sampleIdsBc.value(s)), g)) + val stringVDS = vds.mapValuesWithPartialApplication( + (v: Variant, va: AnnotationData) => + (s: Int, g: Genotype) => + makeString(v, va)(s, g)) // FIXME add additional command parsing functionality - val variantRegex = """v\.(\w+)""".r + val variantRegex = + """v\.(\w+)""".r val sampleRegex = """s\.(\w+)""".r val topLevelSampleAnnoRegex = """sa\.(\w+)""".r val topLevelVariantAnnoRegex = """va\.(\w+)""".r @@ -109,7 +101,7 @@ object ExportGenotypes extends Command { hadoopDelete(output, state.hadoopConf, recursive = true) stringVDS.rdd - .flatMap { case (v, va, strings) => strings} + .flatMap { case (v, va, strings) => strings } .saveAsTextFile(output) state diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala index 12d779b3da7..f322f33edb4 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala @@ -3,7 +3,7 @@ package org.broadinstitute.hail.driver import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.methods._ import org.broadinstitute.hail.annotations._ -import org.broadinstitute.hail.variant.{Variant, Genotype, Sample} +import org.broadinstitute.hail.variant.{VariantDataset, Variant, Genotype, Sample} import org.kohsuke.args4j.{Option => Args4jOption} object FilterGenotypes extends Command { @@ -28,30 +28,31 @@ object FilterGenotypes extends Command { def run(state: State, options: Options): State = { val vds = state.vds - val vas: AnnotationSignatures = state.vds.metadata.variantAnnotationSignatures - val sas: AnnotationSignatures = 
state.vds.metadata.sampleAnnotationSignatures - val sa = state.vds.metadata.sampleAnnotations + val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures + val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures + val ids = vds.sampleIds + val sa = vds.metadata.sampleAnnotations if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") - val p: ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean)) = try { - val cf = new FilterGenotypeCondition(options.condition, vas, sas, sa) + val p: ((Variant, AnnotationData) => ((Int, Genotype) => Boolean)) = { + val cf = new FilterGenotypeCondition(options.condition, vas, sas, sa, ids) cf.typeCheck() - cf.apply(sa) + cf.apply } - val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) val localKeep = options.keep val localRemove = options.remove //FIXME put keep/remove logic here - val newVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) => - if (p(v, va)(s, Sample(sampleIdsBc.value(s)), g)) { - if (localKeep) g else Genotype(-1, (0, 0), 0, null) - } - else { - if (localRemove) g else Genotype(-1, (0, 0), 0, null) - }) + val newVDS = vds.mapValuesWithPartialApplication( + (v: Variant, va: AnnotationData) => + (s: Int, g: Genotype) => + if (p(v, va)(s, g)) { + if (localKeep) g else Genotype(-1, (0, 0), 0, null) + } else { + if (localRemove) g else Genotype(-1, (0, 0), 0, null) + }) state.copy(vds = newVDS) } } diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index a79e5e1d0f0..8076ba7fb63 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -2,13 +2,15 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.annotations._ +import 
org.broadinstitute.hail.methods.ExportUtils.{ExportGenotypePostSA, ExportGenotypeWithSA} import org.broadinstitute.hail.variant.{Sample, Variant, Genotype} import org.broadinstitute.hail.Utils._ import scala.language.implicitConversions object ExportUtils { - type ExportGenotypeWithSA = (IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => String))) - type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Sample, Genotype) => String) + type ExportGenotypeWithSA = ((IndexedSeq[AnnotationData], IndexedSeq[String]) => + ((Variant, AnnotationData) => ((Int, Genotype) => String))) + type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Genotype) => String) } object UserExportUtils { @@ -45,49 +47,57 @@ object UserExportUtils { class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures, missingValue: String) extends Evaluator[(Variant, AnnotationData) => String]({ - "(__v: org.broadinstitute.hail.variant.Variant, \n" + - "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + - "import org.broadinstitute.hail.methods.FilterUtils._\n" + - "import org.broadinstitute.hail.methods.UserExportUtils._\n" + - "val v: org.broadinstitute.hail.methods.UserExportUtils.ExportVariant = new ExportVariant(__v)\n" + - signatures(vas, "__va", makeToString = true, missing = missingValue) + - instantiate("va", "__va") + - s"""Array($list).map(formatString(_, "$missingValue")).mkString("\t")}: String"""}) { + s"""(__v: org.broadinstitute.hail.variant.Variant, + | __va: org.broadinstitute.hail.annotations.AnnotationData) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | import org.broadinstitute.hail.methods.UserExportUtils._ + | val v: ExportVariant = new ExportVariant(__v) + | ${signatures(vas, "__va", makeToString = true, missing = missingValue)} + | ${instantiate("va", "__va")} + | Array($list).map(formatString(_, "$missingValue")).mkString("\t") + |}: String + """.stripMargin}) { def apply(v: 
Variant, va: AnnotationData): String = eval()(v, va) } class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures, missingValue: String) - extends Evaluator[(Sample, AnnotationData) => String]({ - "(s: org.broadinstitute.hail.variant.Sample, \n" + - "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + - "import org.broadinstitute.hail.methods.FilterUtils._\n" + - "import org.broadinstitute.hail.methods.UserExportUtils._\n" + - signatures(sas, "__sa", makeToString = true, missing = missingValue) + - instantiate("sa", "__sa") + - s"""Array($list).map(formatString(_, "$missingValue")).mkString("\t")}: String"""}) { + extends Evaluator[(Sample, AnnotationData) => String]( + s"""(s: org.broadinstitute.hail.variant.Sample, + | __sa: org.broadinstitute.hail.annotations.AnnotationData) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | import org.broadinstitute.hail.methods.UserExportUtils._ + | ${signatures(sas, "__sa", makeToString = true, missing = missingValue)} + | ${instantiate("sa", "__sa")} + | Array($list).map(formatString(_, "$missingValue")).mkString("\t") + |}: String + """.stripMargin) { def apply(s: Sample, sa: AnnotationData): String = eval()(s, sa) } class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: AnnotationSignatures, - sad: IndexedSeq[AnnotationData], missingValue: String) - extends EvaluatorWithTransformation[ExportUtils.ExportGenotypeWithSA, ExportUtils.ExportGenotypePostSA]( - {"(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData]) => {\n" + - "import org.broadinstitute.hail.methods.FilterUtils._\n" + - "import org.broadinstitute.hail.methods.UserExportUtils._\n" + - signatures(sas, "__sa") + - makeIndexedSeq("__saArray", "__sa", "__sa") + - "(__v: org.broadinstitute.hail.variant.Variant, " + - "__va: org.broadinstitute.hail.annotations.AnnotationData) => {\n" + - "val v: org.broadinstitute.hail.methods.UserExportUtils.ExportVariant = new ExportVariant(__v)\n" + - 
signatures(vas, "__va", makeToString = true, missing = missingValue) + - instantiate("va", "__va") + - """(__sIndex: Int, - | s: org.broadinstitute.hail.variant.Sample, - | g: org.broadinstitute.hail.variant.Genotype) => { - | val sa = __saArray(__sIndex) - | Array($list).map(formatString(_, "$missingValue")).mkString("\t")}: String}} - """.stripMargin}, t => t(sad)) { - def apply(sa: IndexedSeq[AnnotationData]) - (v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): String = - eval()(v, va)(sIndex, s, g) + sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String], missingValue: String) + extends EvaluatorWithTransformation[ExportGenotypeWithSA, ExportGenotypePostSA]( + s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], + | __ids: IndexedSeq[String]) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | import org.broadinstitute.hail.methods.UserExportUtils._ + | ${signatures(sas, "__sa")} + | ${makeIndexedSeq("__saArray", "__sa", "__sa")} + | (__v: org.broadinstitute.hail.variant.Variant, + | __va: org.broadinstitute.hail.annotations.AnnotationData) => { + | val v = new ExportVariant(__v) + | ${signatures(vas, "__va")} + | ${instantiate("va", "__va")} + | (__sIndex: Int, + | g: org.broadinstitute.hail.variant.Genotype) => { + | val sa = __saArray(__sIndex) + | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) + | Array($list).map(formatString(_, "$missingValue")).mkString("\t") + | }: String + | } + | } + """.stripMargin, + t => t(sad, ids)) { + def apply(v: Variant, va: AnnotationData)(sIndex: Int, g: Genotype): String = + eval()(v, va)(sIndex, g) } \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 0b5f34a4f7d..21dd572dbfe 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -1,6 +1,5 @@ 
package org.broadinstitute.hail.methods -import org.apache.spark.SparkContext import org.broadinstitute.hail.Utils import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.annotations._ @@ -22,13 +21,18 @@ object ConvertibleString { class ConvertibleString(val s: String) extends AnyVal { def toArrayInt: Array[Int] = s.split(",").map(i => i.toInt) + def toArrayDouble: Array[Double] = s.split(",").map(i => i.toDouble) + def toSetString: Set[String] = s.split(",").toSet + def toStupidAnnotation: Array[Array[String]] = s.split(",").map(_.split("|").map(_.trim)) + def toOptionInt: Option[Int] = s match { case ConvertibleString.someRegex(i) => Some(i.toInt) case "None" => None } + def toOptionDouble: Option[Double] = s match { case ConvertibleString.someRegex(i) => Some(i.toDouble) case "None" => None @@ -36,21 +40,23 @@ class ConvertibleString(val s: String) extends AnyVal { } object FilterUtils { - type FilterGenotypeWithSA = (IndexedSeq[AnnotationData] => ((Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean))) - type FilterGenotypePostSA = (Variant, AnnotationData) => ((Int, Sample, Genotype) => Boolean) + type FilterGenotypeWithSA = ((IndexedSeq[AnnotationData], IndexedSeq[String]) => + ((Variant, AnnotationData) => ((Int, Genotype) => Boolean))) + type FilterGenotypePostSA = (Variant, AnnotationData) => ((Int, Genotype) => Boolean) + implicit def toFilterString(s: String): FilterString = new FilterString(s) implicit def toConvertibleString(s: String): ConvertibleString = new ConvertibleString(s) -// def test(): (Variant, Annotations[String]) => Boolean = { -// throw new UnsupportedOperationException -// } + // def test(): (Variant, Annotations[String]) => Boolean = { + // throw new UnsupportedOperationException + // } } class EvaluatorWithTransformation[T, S](t: String, f: T => S)(implicit tct: ClassTag[T]) extends Serializable { - @transient var p: Option[S] = None + @transient var p: Option[S] = None - def typeCheck() { + def typeCheck() 
{ require(p.isEmpty) p = Some(f(Utils.eval[T](t))) } @@ -94,42 +100,53 @@ class Evaluator[T](t: String)(implicit tct: ClassTag[T]) class FilterVariantCondition(cond: String, vas: AnnotationSignatures) extends Evaluator[(Variant, AnnotationData) => Boolean]({ - "(v: org.broadinstitute.hail.variant.Variant, \n" + - "__va: org.broadinstitute.hail.annotations.AnnotationData) => { \n" + - "import org.broadinstitute.hail.methods.FilterUtils._; \n" + - signatures(vas, "__va") + - instantiate("va", "__va") + - cond + " }: Boolean"}) { + s"""(v: org.broadinstitute.hail.variant.Variant, + | __va: org.broadinstitute.hail.annotations.AnnotationData) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | ${signatures(vas, "__va")} + | ${instantiate("va", "__va")} + | $cond + |}: Boolean + """.stripMargin + }) { def apply(v: Variant, va: AnnotationData): Boolean = eval()(v, va) } class FilterSampleCondition(cond: String, sas: AnnotationSignatures) extends Evaluator[(Sample, AnnotationData) => Boolean]( - "(s: org.broadinstitute.hail.variant.Sample, \n" + - "__sa: org.broadinstitute.hail.annotations.AnnotationData) => { " + - "import org.broadinstitute.hail.methods.FilterUtils._; " + - signatures(sas, "__sa") + - instantiate("sa", "__sa") + - cond + " }: Boolean") { + s"""(s: org.broadinstitute.hail.variant.Sample, + | __sa: org.broadinstitute.hail.annotations.AnnotationData) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | ${signatures(sas, "__sa")} + | ${instantiate("sa", "__sa")} + | $cond + |}: Boolean + """.stripMargin) { def apply(s: Sample, sa: AnnotationData): Boolean = eval()(s, sa) } class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: AnnotationSignatures, - sad: IndexedSeq[AnnotationData]) + sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String]) extends EvaluatorWithTransformation[FilterGenotypeWithSA, FilterGenotypePostSA]( - {"(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData]) => {\n" + - 
"import org.broadinstitute.hail.methods.FilterUtils._\n" + - signatures(sas, "__sa") + - makeIndexedSeq("__saArray", "__sa", "__sa") + - "(v: org.broadinstitute.hail.variant.Variant, " + - "__va: org.broadinstitute.hail.annotations.AnnotationData) => {\n" + - signatures(vas, "__va") + - instantiate("va", "__va") + - "(__sIndex: Int, " + - "s: org.broadinstitute.hail.variant.Sample, " + - "g: org.broadinstitute.hail.variant.Genotype) => {\n" + - "val sa = __saArray(__sIndex)\n" + - cond + " }: Boolean}}"}, t => t(sad)) { - def apply(sa: IndexedSeq[AnnotationData])(v: Variant, va: AnnotationData)(sIndex: Int, s: Sample, g: Genotype): Boolean = - eval()(v, va)(sIndex, s, g) + s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], + | __ids: IndexedSeq[String]) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | ${signatures(sas, "__sa")} + | ${makeIndexedSeq("__saArray", "__sa", "__sa")} + | (v: org.broadinstitute.hail.variant.Variant, + | __va: org.broadinstitute.hail.annotations.AnnotationData) => { + | ${signatures(vas, "__va")} + | ${instantiate("va", "__va")} + | (__sIndex: Int, + | g: org.broadinstitute.hail.variant.Genotype) => { + | val sa = __saArray(__sIndex) + | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) + | $cond + | }: Boolean + | } + | } + """.stripMargin, + t => t(sad, ids)) { + def apply(v: Variant, va: AnnotationData)(sIndex: Int, g: Genotype): Boolean = + eval()(v, va)(sIndex, g) } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala index 1f05b931f25..c1320e4e08d 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala @@ -98,6 +98,16 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, }) } + def mapValuesWithPartialApplication[U](f: (Variant, AnnotationData) => ((Int, T) 
=> U)) + (implicit utt: TypeTag[U], uct: ClassTag[U]): VariantSampleMatrix[U] = { + val localSamplesBc = sparkContext.broadcast(localSamples) + copy(rdd = rdd.map { case (v, va, gs) => + val fPrime = f(v, va) + (v, va, localSamplesBc.value.view.zip(gs.view) + .map { case (s, t) => fPrime(s, t) }) + }) + } + def map[U](f: T => U)(implicit uct: ClassTag[U]): RDD[U] = mapWithKeys((v, s, g) => f(g)) From f3ff63d161be52906c047b159831d20a0b440a37 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Wed, 16 Dec 2015 21:46:32 -0500 Subject: [PATCH 09/15] second round cseed fix intermediate checkpoint --- .../annotations/AnnotationSignature.scala | 6 +- .../hail/annotations/Annotations.scala | 88 +++++++++++-------- .../hail/annotations/SimpleSignature.scala | 11 --- .../hail/annotations/VCFSignature.scala | 49 +++++++++++ .../broadinstitute/hail/driver/SampleQC.scala | 62 ++++++------- .../hail/driver/VariantQC.scala | 54 ++++++------ .../broadinstitute/hail/methods/LoadVCF.scala | 63 ++++--------- .../hail/annotations/AnnotationsSuite.scala | 16 ++-- 8 files changed, 183 insertions(+), 166 deletions(-) delete mode 100644 src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala create mode 100644 src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala diff --git a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala index e71b4a20ebd..3eff7ff2fbd 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala @@ -1,8 +1,8 @@ package org.broadinstitute.hail.annotations abstract class AnnotationSignature { - def buildCaseClasses: String - def conversion: String - def getType: String + def emitUtilities: String + def emitConversionIdentifier: String + def emitType: String } diff --git 
a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index b735a2946fe..ca95df04b23 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -6,13 +6,13 @@ case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T def hasMap(str: String): Boolean = maps.contains(str) - def contains(str: String): Boolean = vals.contains(str) + def containsVal(str: String): Boolean = vals.contains(str) - def contains(parent: String, str: String): Boolean = hasMap(parent) && maps(parent).contains(str) + def containsInMap(parent: String, str: String): Boolean = hasMap(parent) && maps(parent).contains(str) - def get(str: String): Option[T] = vals.get(str) + def getVal(str: String): Option[T] = vals.get(str) - def get(parent: String, str: String): Option[T] = + def getInMap(parent: String, str: String): Option[T] = maps.get(parent).flatMap(_.get(str)) def getMap(parent: String): Option[Map[String, T]] = maps.get(parent) @@ -27,12 +27,17 @@ case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T def addVals(newVals: Map[String, T]): Annotations[T] = Annotations(maps, vals ++ newVals) - def ++ (other: Annotations[T]): Annotations[T] = { + def ++(other: Annotations[T]): Annotations[T] = { new Annotations(maps ++ other.maps, vals ++ other.vals) } } object Annotations { + + def emptyOf[T](): Annotations[T] = { + Annotations(Map.empty[String, Map[String, T]], Map.empty[String, T]) + } + def emptyOfSignature(): AnnotationSignatures = Annotations(Map.empty[String, Map[String, AnnotationSignature]], Map.empty[String, AnnotationSignature]) @@ -60,45 +65,50 @@ object AnnotationClassBuilder { makeToString: Boolean = false, missing: String = ""): String = { val internalClasses = sigs.maps.map { case (subclass, subMap) => - s"class __${subclass}Annotations(subMap: 
Map[String, String]) extends Serializable {\n" + - subMap.map { case (k, sig) => - // s""" val $k: $kType = subMap.getFromMapOrElse("$k", \"false\").$kMethod\n""" - val default = getDefault(sig.getType) - s""" val $k: ${sig.getType} = subMap.getOrElse("$k", "$default").${sig.conversion}\n""" + val attrs = subMap + .map { case (k, sig) => + s""" val $k: Option[${sig.emitType}] = subMap.get("$k").${sig.emitConversionIdentifier}""" } - .foldRight[String]("")(_ + _) + { + .mkString("\n") + val methods: String = { if (makeToString) { - val keys = subMap.keys.toArray.sorted - " def __fields: Array[String] = Array(" + { - if (keys.isEmpty) "" - else keys.map(s => s"""formatString($s, "$missing")""") - .mkString(",") - } + ")\n" + - """ override def toString: String = - | if (__fields.length == 0) "" else __fields.mkString(";") - | def all: String = if (__fields.length == 0) "" else __fields.mkString("\t") - | + s""" def __fields: Array[String] = Array( + | ${subMap.keys.toArray.sorted.map(s => s"""formatString($s, "$missing")""").mkString(",")} + | ) + | override def toString: String = __fields.mkString(";") + | def all: String = __fields.mkString("\t") """.stripMargin - } - else "" - } + - "}\n" + } else "" + } + s"""class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable { + |$attrs + |$methods + |} + |""".stripMargin } .foldRight[String]("")(_ + _) - val hiddenClass = s"class ${hiddenClassName}Annotations" + - s"(annot: org.broadinstitute.hail.annotations.AnnotationData) extends Serializable {\n" + - sigs.maps.map { case (subclass, subMap) => - s""" val $subclass = new __${subclass}Annotations(annot.maps(\"$subclass\"))\n""" + val hiddenClass = { + val classes = + sigs.maps.map { case (subclass, subMap) => + s""" val $subclass = new __${subclass}Annotations(annot.maps("$subclass"))""" + } + .mkString("\n") + val vals = sigs.vals.map { case (k, sig) => + s""" val $k: Option[${sig.emitType}] = annot.getVal("$k").${sig.emitConversionIdentifier}""" 
} - .foldRight[String]("")(_ + _) + - sigs.vals.map { case (k, sig) => - val default = getDefault(sig.getType) - s""" val $k: ${sig.getType} = annot.vals.getOrElse("$k", "$default").${sig.conversion} \n""" - } - .foldRight[String]("")(_ + _) + "}\n" + .mkString("\n") + s"""class ${hiddenClassName}Annotations(annot: org.broadinstitute.hail.annotations.AnnotationData) + | extends Serializable { + | $classes + | $vals + |""".stripMargin + } - "\n" + internalClasses + hiddenClass + s""" + |$internalClasses + |$hiddenClass + """.stripMargin } def instantiate(exposedName: String, hiddenClassName: String): String = { @@ -107,8 +117,8 @@ object AnnotationClassBuilder { def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = s"""val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] = - |$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_)) - | + |$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_)) + | """.stripMargin val arrayRegex = """Array\[(\w+)\]""".r @@ -124,6 +134,6 @@ object AnnotationClassBuilder { case optionRegex(subType) => "None" case arrayRegex(subType) => getDefault(subType) case _ => "" - } + } } } diff --git a/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala deleted file mode 100644 index e2ec601c691..00000000000 --- a/src/main/scala/org/broadinstitute/hail/annotations/SimpleSignature.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.broadinstitute.hail.annotations - -case class SimpleSignature(scalaType: String, conversionMethod: String, description: String) - extends AnnotationSignature { - - def buildCaseClasses: String = "" - - def conversion: String = conversionMethod - - def getType: String = scalaType -} diff --git a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala 
b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala new file mode 100644 index 00000000000..e8c4cd7ba1d --- /dev/null +++ b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala @@ -0,0 +1,49 @@ +package org.broadinstitute.hail.annotations + +case class VCFSignature(vcfType: String, emitType: String, number: String, + emitConversionIdentifier: String, description: String) + extends AnnotationSignature { + + def emitUtilities: String = "" +} + +object VCFSignature { + + val arrayRegex = """Array\[(\w+)\]""".r + val setRegex = """Set\[(\w+)\]""".r + def getConversionMethod(str: String): String = { + str match { + case arrayRegex(subType) => s"toArray$subType" + case setRegex(subType) => s"toSet$subType" + case _ => s"to$str" + } + } + + def vcfTypeToScala(str: String): String = + str match { + case "Flag" => "Boolean" + case "Integer" => "Int" + case "Float" => "Double" + case "String" => "String" + case "Character" => "Character" + case "." => "String" + case _ => throw new UnsupportedOperationException("unexpected annotation type") + } + + def parse(number: String, vcfType: String, desc: String): AnnotationSignature = { + val parsedType: String = vcfTypeToScala(vcfType) + + val scalaType: String = { + if (number == "0" || number == "1") { + parsedType + } + else if (number == "A" || number == "R" || number == "G") { + s"Array[$parsedType]" + } + else + throw new UnsupportedOperationException + } + val conversionMethod = getConversionMethod(scalaType) + new VCFSignature(vcfType, scalaType, number, conversionMethod, desc) + } +} \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index 6f56517b118..a4489dec780 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -36,37 +36,37 @@ object SampleQCCombiner { "rHetHomVar\t" + 
"rDeletionInsertion" - val signatures = Map("nCalled" -> new SimpleSignature("Int", "toInt", ""), - "nNotCalled" -> new SimpleSignature("Int", "toInt", ""), - "nHomRef" -> new SimpleSignature("Int", "toInt", ""), - "nHet" -> new SimpleSignature("Int", "toInt", ""), - "nHomVar" -> new SimpleSignature("Int", "toInt", ""), - "nSNP" -> new SimpleSignature("Int", "toInt", ""), - "nInsertion" -> new SimpleSignature("Int", "toInt", ""), - "nDeletion" -> new SimpleSignature("Int", "toInt", ""), - "nSingleton" -> new SimpleSignature("Int", "toInt", ""), - "nTransition" -> new SimpleSignature("Int", "toInt", ""), - "nTransversion" -> new SimpleSignature("Int", "toInt", ""), - "dpMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "nNonRef" -> new SimpleSignature("Int", "toInt", ""), - 
"rTiTv" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "rHetHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "rDeletionInsertion" -> new SimpleSignature("Option[Double]", "toOptionDouble", "")) + val signatures = Map("nCalled" -> new VCFSignature("Int", "toInt", ""), + "nNotCalled" -> new VCFSignature("Int", "toInt", ""), + "nHomRef" -> new VCFSignature("Int", "toInt", ""), + "nHet" -> new VCFSignature("Int", "toInt", ""), + "nHomVar" -> new VCFSignature("Int", "toInt", ""), + "nSNP" -> new VCFSignature("Int", "toInt", ""), + "nInsertion" -> new VCFSignature("Int", "toInt", ""), + "nDeletion" -> new VCFSignature("Int", "toInt", ""), + "nSingleton" -> new VCFSignature("Int", "toInt", ""), + "nTransition" -> new VCFSignature("Int", "toInt", ""), + "nTransversion" -> new VCFSignature("Int", "toInt", ""), + "dpMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + 
"gqStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "nNonRef" -> new VCFSignature("Int", "toInt", ""), + "rTiTv" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "rHetHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "rDeletionInsertion" -> new VCFSignature("Option[Double]", "toOptionDouble", "")) } class SampleQCCombiner extends Serializable { diff --git a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala index 158ba3aaaf9..e2fc7a4a055 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala @@ -31,33 +31,33 @@ object VariantQCCombiner { "rHetHomVar\t" + "rExpectedHetFrequency\tpHWE\t" - val signatures = Map("nCalled" -> new SimpleSignature("Int", "toInt", ""), - "nNotCalled" -> new SimpleSignature("Int", "toInt", ""), - "nHomRef" -> new SimpleSignature("Int", "toInt", ""), - "nHet" -> new SimpleSignature("Int", "toInt", ""), - "nHomVar" -> new SimpleSignature("Int", "toInt", ""), - "dpMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMean" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDev" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - 
"gqStDevHomRef" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHet" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "MAF" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "nNonRef" -> new SimpleSignature("Int", "toInt", ""), - "rHeterozygosity" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "rHetHomVar" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "rExpectedHetFrequency" -> new SimpleSignature("Option[Double]", "toOptionDouble", ""), - "pHWE" -> new SimpleSignature("Double", "toDouble", "")) + val signatures = Map("nCalled" -> new VCFSignature("Int", "toInt", ""), + "nNotCalled" -> new VCFSignature("Int", "toInt", ""), + "nHomRef" -> new VCFSignature("Int", "toInt", ""), + "nHet" -> new VCFSignature("Int", "toInt", ""), + "nHomVar" -> new VCFSignature("Int", "toInt", ""), + "dpMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomRef" -> new 
VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "gqStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "MAF" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "nNonRef" -> new VCFSignature("Int", "toInt", ""), + "rHeterozygosity" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "rHetHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "rExpectedHetFrequency" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "pHWE" -> new VCFSignature("Double", "toDouble", "")) } class VariantQCCombiner extends Serializable { diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index 8b1cee3b2a9..f8328075ebe 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -1,52 +1,17 @@ package org.broadinstitute.hail.methods +import org.broadinstitute.hail.vcf.BufferedLineIterator + import scala.io.Source import org.apache.spark.{SparkConf, SparkContext} import org.broadinstitute.hail.variant._ import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.vcf import org.broadinstitute.hail.annotations._ +import scala.collection.convert._ object LoadVCF { // FIXME move to VariantDataset - - val arrayRegex = """Array\[(\w+)\]""".r - val setRegex = """Set\[(\w+)\]""".r - def getConversionMethod(str: String): String = { - str match { - case arrayRegex(subType) => s"toArray$subType" - case setRegex(subType) => s"toSet$subType" - case _ => s"to$str" - } - } - - def parseInfoType(str: String): String = { - str match { - case "Flag" => "Boolean" - case "Integer" => "Int" - case "Float" => "Double" - case 
"String" => "String" - case "Character" => "String" - case _ => throw new UnsupportedOperationException("unexpected annotation type") - } - } - - def parseInfoLine(number: String, typeOf: String, desc: String): AnnotationSignature = { - val parsedType = parseInfoType(typeOf) - if (number == "0" || number == "1") { - new SimpleSignature(parsedType, getConversionMethod(parsedType), desc) - } - else if (number == "A" || number == "R" || number == "G") { - val arrType = s"Array[$parsedType]" - new SimpleSignature(arrType, getConversionMethod(arrType), desc) - } - else if (number == "." && parsedType == "String") { - new SimpleSignature(parsedType, getConversionMethod(parsedType), desc) - } - else - throw new UnsupportedOperationException - } - def apply(sc: SparkContext, file: String, compress: Boolean = true, @@ -64,6 +29,11 @@ object LoadVCF { .toArray } + val codec = new htsjdk.variant.vcf.VCFCodec() + val header = codec.readHeader(new BufferedLineIterator(headerLines.iterator.buffered)) + .asInstanceOf[htsjdk.variant.vcf.VCFHeader] + + val contigs = header.getContigLines. 
val contigRegex ="""##contig=""".r val contigLengths = { val contigMap = headerLines.map { @@ -81,11 +51,10 @@ object LoadVCF { val annoRegex = """##INFO=""".r val annotationTypes = { - val annotationMap = headerLines.map { - case annoRegex(id, number, typeOf, desc) => Some(id, parseInfoLine(number, typeOf, desc)) + val annotationMap = headerLines.flatMap { + case annoRegex(id, number, typeOf, desc) => Some(id, VCFSignature.parse(number, typeOf, desc)) case _ => None - }.flatMap(i => i) - .toMap + }.toMap if (annotationMap.nonEmpty) annotationMap @@ -93,11 +62,11 @@ object LoadVCF { Map.empty[String, AnnotationSignature] } val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> annotationTypes), - Map("filters" -> new SimpleSignature("Set[String]", "toSetString", "filters applied to site"), - "pass" -> new SimpleSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), - "multiallelic" -> new SimpleSignature("Boolean", "toBoolean", "Site is a split multiallelic"), - "qual" -> new SimpleSignature("Double", "toDouble", "vcf qual field"), - "rsid" -> new SimpleSignature("String", "toString", "site rdID"))) + Map("filters" -> new VCFSignature("Set[String]", "toSetString", "filters applied to site"), + "pass" -> new VCFSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), + "multiallelic" -> new VCFSignature("Boolean", "toBoolean", "Site is a split multiallelic"), + "qual" -> new VCFSignature("Double", "toDouble", "vcf qual field"), + "rsid" -> new VCFSignature("String", "toString", "site rdID"))) val headerLine = headerLines.last assert(headerLine(0) == '#' && headerLine(1) != '#') diff --git a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala index a1a92f6a204..97ef316f889 100644 --- a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala +++ 
b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala @@ -18,7 +18,7 @@ class AnnotationsSuite extends SparkSuite { /* The below tests are designed to check for a subset of variants and info fields, that: - 1. the types, conversion strings, and description strings agree with the VCF + 1. the types, emitConversionIdentifier strings, and description strings agree with the VCF 2. the strings stored in the AnnotationData classes agree with the VCF 3. the strings stored in the AnnotationData classes convert correctly to the proper type */ @@ -34,7 +34,7 @@ class AnnotationsSuite extends SparkSuite { assert(variantAnnotationMap.contains(anotherVariant)) // type Int - INFO.DP - assert(vas.get("info", "DP").contains(SimpleSignature("Int", "toInt", + assert(vas.get("info", "DP").contains(VCFSignature("Int", "toInt", "Approximate read depth; some reads may have been filtered"))) assert(variantAnnotationMap(firstVariant) .get("info", "DP") @@ -48,7 +48,7 @@ class AnnotationsSuite extends SparkSuite { .get("info", "DP").get.toInt == 20271) // type Double - INFO.HWP - assert(vas.get("info", "HWP").contains(SimpleSignature("Double", "toDouble", + assert(vas.get("info", "HWP").contains(VCFSignature("Double", "toDouble", "P value from test of Hardy Weinberg Equilibrium"))) assert(variantAnnotationMap(firstVariant) .contains("info", "HWP") && @@ -60,7 +60,7 @@ class AnnotationsSuite extends SparkSuite { .get("info", "HWP").get.toDouble, 0.8286)) // type String - INFO.culprit - assert(vas.get("info", "culprit").contains(SimpleSignature("String", "toString", + assert(vas.get("info", "culprit").contains(VCFSignature("String", "toString", "The annotation which was the worst performing in the Gaussian mixture model, " + "likely the reason why the variant was filtered out"))) assert(variantAnnotationMap(firstVariant) @@ -71,7 +71,7 @@ class AnnotationsSuite extends SparkSuite { .contains("FS")) // type Array - INFO.AC (allele count) - assert(vas.get("info", 
"AC").contains(SimpleSignature("Array[Int]", "toArrayInt", + assert(vas.get("info", "AC").contains(VCFSignature("Array[Int]", "toArrayInt", "Allele count in genotypes, for each ALT allele, in the same order as listed"))) assert(variantAnnotationMap(firstVariant) .get("info", "AC") @@ -87,7 +87,7 @@ class AnnotationsSuite extends SparkSuite { .sameElements(Array(13))) // type Boolean/flag - INFO.DB (dbSNP membership) - assert(vas.get("info", "DB").contains(SimpleSignature("Boolean", "toBoolean", + assert(vas.get("info", "DB").contains(VCFSignature("Boolean", "toBoolean", "dbSNP Membership"))) assert(variantAnnotationMap(firstVariant) .get("info", "DB") @@ -98,7 +98,7 @@ class AnnotationsSuite extends SparkSuite { .contains("info", "DB")) //type Set[String] - assert(vas.get("filters").contains(SimpleSignature("Set[String]", "toSetString", "filters applied to site"))) + assert(vas.get("filters").contains(VCFSignature("Set[String]", "toSetString", "filters applied to site"))) assert(variantAnnotationMap(firstVariant) .get("filters").contains("PASS") && variantAnnotationMap(firstVariant) @@ -109,7 +109,7 @@ class AnnotationsSuite extends SparkSuite { .get("filters").get.toSetString == Set[String]("VQSRTrancheSNP99.95to100.00")) // GATK PASS - assert(vas.get("pass").contains(SimpleSignature("Boolean", "toBoolean", + assert(vas.get("pass").contains(VCFSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"))) assert(variantAnnotationMap(firstVariant) .get("pass").contains("true")) From 7d7026ad96e118dc0b9ff101b9c6d8c1a6302c7d Mon Sep 17 00:00:00 2001 From: tpoterba Date: Thu, 17 Dec 2015 12:36:28 -0500 Subject: [PATCH 10/15] Replaced SimpleSignature with VCFSignature -- added accessor methods here Replaced regex matching for VCF header with htsjdk.variant.vcf classes -- contig is still inaccessible in the htsjdk framework, however Replaced all .getOrElse function calls in the dynamically generated code with .get, so that all variables exposed 
in filtering are now optional. Fixed and cleaned up several other areas that needed work. --- .../hail/annotations/Annotations.scala | 17 ++- .../hail/annotations/VCFSignature.scala | 49 +++++-- .../broadinstitute/hail/driver/SampleQC.scala | 125 +++++++++--------- .../hail/driver/VariantQC.scala | 99 +++++++------- .../broadinstitute/hail/methods/Filter.scala | 54 +++----- .../broadinstitute/hail/methods/LoadVCF.scala | 36 +++-- .../hail/variant/VariantSampleMatrix.scala | 2 +- .../hail/annotations/AnnotationsSuite.scala | 61 +++++---- .../hail/methods/FilterSuite.scala | 12 +- 9 files changed, 231 insertions(+), 224 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index ca95df04b23..6921b7454e7 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -67,7 +67,7 @@ object AnnotationClassBuilder { case (subclass, subMap) => val attrs = subMap .map { case (k, sig) => - s""" val $k: Option[${sig.emitType}] = subMap.get("$k").${sig.emitConversionIdentifier}""" + s""" val $k: Option[${sig.emitType}] = subMap.get("$k").map(_.${sig.emitConversionIdentifier})""" } .mkString("\n") val methods: String = { @@ -76,17 +76,15 @@ object AnnotationClassBuilder { | ${subMap.keys.toArray.sorted.map(s => s"""formatString($s, "$missing")""").mkString(",")} | ) | override def toString: String = __fields.mkString(";") - | def all: String = __fields.mkString("\t") - """.stripMargin + | def all: String = __fields.mkString("\t")""".stripMargin } else "" } s"""class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable { |$attrs |$methods - |} - |""".stripMargin + |}""".stripMargin } - .foldRight[String]("")(_ + _) + .mkString("\n") val hiddenClass = { val classes = @@ -95,13 +93,14 @@ object AnnotationClassBuilder { } .mkString("\n") val vals = 
sigs.vals.map { case (k, sig) => - s""" val $k: Option[${sig.emitType}] = annot.getVal("$k").${sig.emitConversionIdentifier}""" + s""" val $k: Option[${sig.emitType}] = annot.getVal("$k").map(_.${sig.emitConversionIdentifier})""" } .mkString("\n") s"""class ${hiddenClassName}Annotations(annot: org.broadinstitute.hail.annotations.AnnotationData) | extends Serializable { - | $classes - | $vals + | ${if (classes.nonEmpty) classes else "// no classes"} + | ${if (vals.nonEmpty) vals else "// no vals"} + |} |""".stripMargin } diff --git a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala index e8c4cd7ba1d..328badc018c 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala @@ -1,9 +1,14 @@ package org.broadinstitute.hail.annotations +import htsjdk.variant.vcf.{VCFInfoHeaderLine, VCFHeaderLineCount, VCFHeaderLineType} + case class VCFSignature(vcfType: String, emitType: String, number: String, emitConversionIdentifier: String, description: String) extends AnnotationSignature { + def this(scalaType: String, conversionMethod: String, desc: String) = + this("", scalaType, "", conversionMethod, "") + def emitUtilities: String = "" } @@ -11,6 +16,7 @@ object VCFSignature { val arrayRegex = """Array\[(\w+)\]""".r val setRegex = """Set\[(\w+)\]""".r + def getConversionMethod(str: String): String = { str match { case arrayRegex(subType) => s"toArray$subType" @@ -30,20 +36,35 @@ object VCFSignature { case _ => throw new UnsupportedOperationException("unexpected annotation type") } - def parse(number: String, vcfType: String, desc: String): AnnotationSignature = { - val parsedType: String = vcfTypeToScala(vcfType) - - val scalaType: String = { - if (number == "0" || number == "1") { - parsedType - } - else if (number == "A" || number == "R" || number == "G") { - s"Array[$parsedType]" - } - 
else - throw new UnsupportedOperationException + val integerRegex = """(\d+)""".r + + def parse(line: VCFInfoHeaderLine): AnnotationSignature = { + val vcfType = line.getType.toString + val parsedType = line.getType match { + case VCFHeaderLineType.Integer => "Int" + case VCFHeaderLineType.Float => "Double" + case VCFHeaderLineType.String => "String" + case VCFHeaderLineType.Character => "Character" + case VCFHeaderLineType.Flag => "Boolean" + } + val parsedCount = line.getCountType match { + case VCFHeaderLineCount.A => "A" + case VCFHeaderLineCount.G => "G" + case VCFHeaderLineCount.R => "R" + case VCFHeaderLineCount.INTEGER => line.getCount.toString + case VCFHeaderLineCount.UNBOUNDED => "." + } + val scalaType = parsedCount match { + case "A" | "R" | "G" => s"Array[$parsedType]" + case integerRegex(i) => if (i.toInt > 1) s"Array[$parsedType]" else parsedType + case _ => parsedType } val conversionMethod = getConversionMethod(scalaType) - new VCFSignature(vcfType, scalaType, number, conversionMethod, desc) + val desc = line.getDescription + + + new VCFSignature(vcfType, scalaType, parsedCount, conversionMethod, desc) + + } -} \ No newline at end of file +} diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index a4489dec780..c59aadc4b53 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -47,26 +47,26 @@ object SampleQCCombiner { "nSingleton" -> new VCFSignature("Int", "toInt", ""), "nTransition" -> new VCFSignature("Int", "toInt", ""), "nTransversion" -> new VCFSignature("Int", "toInt", ""), - "dpMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", 
""), - "dpMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMean" -> new VCFSignature("Double", "toDouble", ""), + "dpStDev" -> new VCFSignature("Double", "toDouble", ""), + "dpMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), + "dpStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), + "dpMeanHet" -> new VCFSignature("Double", "toDouble", ""), + "dpStDevHet" -> new VCFSignature("Double", "toDouble", ""), + "dpMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), + "dpStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), + "gqMean" -> new VCFSignature("Double", "toDouble", ""), + "gqStDev" -> new VCFSignature("Double", "toDouble", ""), + "gqMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), + "gqStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), + "gqMeanHet" -> new VCFSignature("Double", "toDouble", ""), + "gqStDevHet" -> new VCFSignature("Double", "toDouble", ""), + "gqMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), + "gqStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), "nNonRef" -> new VCFSignature("Int", "toInt", ""), - "rTiTv" -> new 
VCFSignature("Option[Double]", "toOptionDouble", ""), - "rHetHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "rDeletionInsertion" -> new VCFSignature("Option[Double]", "toOptionDouble", "")) + "rTiTv" -> new VCFSignature("Double", "toDouble", ""), + "rHetHomVar" -> new VCFSignature("Double", "toDouble", ""), + "rDeletionInsertion" -> new VCFSignature("Double", "toDouble", "")) } class SampleQCCombiner extends Serializable { @@ -254,37 +254,42 @@ class SampleQCCombiner extends Serializable { } def asMap: Map[String, String] = { - Map("nCalled" -> (nHomRef + nHet + nHomVar).toString, - "nNotCalled" -> nNotCalled.toString, - "nHomRef" -> nHomRef.toString, - "nHet" -> nHet.toString, - "nHomVar" -> nHomVar.toString, - "nSNP" -> nSNP.toString, - "nInsertion" -> nIns.toString, - "nDeletion" -> nDel.toString, - "nSingleton" -> nSingleton.toString, - "nTransition" -> nTi.toString, - "nTransversion" -> nTv.toString, - "dpMean" -> someIf(dpSC.count > 0, dpSC.mean).toString, - "dpStDev" -> someIf(dpSC.count > 0, dpSC.stdev).toString, - "dpMeanHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.mean).toString, - "dpStDevHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.stdev).toString, - "dpMeanHet" -> someIf(dpHetSC.count > 0, dpHetSC.mean).toString, - "dpStDevHet" -> someIf(dpHetSC.count > 0, dpHetSC.stdev).toString, - "dpMeanHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.mean).toString, - "dpStDevHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.stdev).toString, - "gqMean" -> someIf(gqSC.count > 0, gqSC.mean).toString, - "gqStDev" -> someIf(gqSC.count > 0, gqSC.stdev).toString, - "gqMeanHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.mean).toString, - "gqStDevHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.stdev).toString, - "gqMeanHet" -> someIf(gqHetSC.count > 0, gqHetSC.mean).toString, - "gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev).toString, - "gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean).toString, - 
"gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev).toString, - "nNonRef" -> (nHet + nHomVar).toString, - "rTiTv" -> divOption(nTi, nTv).toString, - "rHetHomVar" -> divOption(nHet, nHomVar).toString, - "rDeletionInsertion" -> divOption(nDel, nIns).toString) + Map("nCalled" -> (nHomRef + nHet + nHomVar), + "nNotCalled" -> nNotCalled, + "nHomRef" -> nHomRef, + "nHet" -> nHet, + "nHomVar" -> nHomVar, + "nSNP" -> nSNP, + "nInsertion" -> nIns, + "nDeletion" -> nDel, + "nSingleton" -> nSingleton, + "nTransition" -> nTi, + "nTransversion" -> nTv, + "dpMean" -> someIf(dpSC.count > 0, dpSC.mean), + "dpStDev" -> someIf(dpSC.count > 0, dpSC.stdev), + "dpMeanHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.mean), + "dpStDevHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.stdev), + "dpMeanHet" -> someIf(dpHetSC.count > 0, dpHetSC.mean), + "dpStDevHet" -> someIf(dpHetSC.count > 0, dpHetSC.stdev), + "dpMeanHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.mean), + "dpStDevHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.stdev), + "gqMean" -> someIf(gqSC.count > 0, gqSC.mean), + "gqStDev" -> someIf(gqSC.count > 0, gqSC.stdev), + "gqMeanHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.mean), + "gqStDevHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.stdev), + "gqMeanHet" -> someIf(gqHetSC.count > 0, gqHetSC.mean), + "gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev), + "gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean), + "gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev), + "nNonRef" -> (nHet + nHomVar), + "rTiTv" -> divOption(nTi, nTv), + "rHetHomVar" -> divOption(nHet, nHomVar), + "rDeletionInsertion" -> divOption(nDel, nIns)) + .flatMap { case (k, v) => v match { + case Some(value) => Some(k, value.toString) + case None => None + case _ => Some(k, v.toString) + }} } } @@ -324,15 +329,15 @@ object SampleQC extends Command { vds .rdd .mapPartitions[(Int, SampleQCCombiner)] { (it: Iterator[(Variant, AnnotationData, 
Iterable[Genotype])]) => - val zeroValue = Array.fill[SampleQCCombiner](localSamplesBc.value.length)(new SampleQCCombiner) - localSamplesBc.value.iterator - .zip(it.foldLeft(zeroValue) { case (acc, (v, va, gs)) => - val vIsSingleton = gs.iterator.existsExactly1(_.isCalledNonRef) - for ((g, i) <- gs.zipWithIndex) - acc(i) = acc(i).merge(v, vIsSingleton, g) - acc - }.iterator) - }.foldByKey(new SampleQCCombiner)((comb1, comb2) => comb1.merge(comb2)) + val zeroValue = Array.fill[SampleQCCombiner](localSamplesBc.value.length)(new SampleQCCombiner) + localSamplesBc.value.iterator + .zip(it.foldLeft(zeroValue) { case (acc, (v, va, gs)) => + val vIsSingleton = gs.iterator.existsExactly1(_.isCalledNonRef) + for ((g, i) <- gs.zipWithIndex) + acc(i) = acc(i).merge(v, vIsSingleton, g) + acc + }.iterator) + }.foldByKey(new SampleQCCombiner)((comb1, comb2) => comb1.merge(comb2)) } def run(state: State, options: Options): State = { @@ -347,7 +352,7 @@ object SampleQC extends Command { val r = results(vds).collectAsMap() val newAnnotations = vds.metadata.sampleAnnotations .zipWithIndex - .map{ case (sa, s) => sa.addMap("qc", r(s).asMap) } + .map { case (sa, s) => sa.addMap("qc", r(s).asMap) } state.copy( vds = vds.copy( metadata = vds.metadata.copy( diff --git a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala index e2fc7a4a055..32380ff62be 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala @@ -36,27 +36,27 @@ object VariantQCCombiner { "nHomRef" -> new VCFSignature("Int", "toInt", ""), "nHet" -> new VCFSignature("Int", "toInt", ""), "nHomVar" -> new VCFSignature("Int", "toInt", ""), - "dpMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - 
"dpStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "dpStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMean" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDev" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomRef" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHet" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqMeanHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "gqStDevHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "MAF" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "dpMean" -> new VCFSignature("Double", "toDouble", ""), + "dpStDev" -> new VCFSignature("Double", "toDouble", ""), + "dpMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), + "dpStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), + "dpMeanHet" -> new VCFSignature("Double", "toDouble", ""), + "dpStDevHet" -> new VCFSignature("Double", "toDouble", ""), + "dpMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), + "dpStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), + "gqMean" -> new VCFSignature("Double", "toDouble", ""), + "gqStDev" -> new VCFSignature("Double", "toDouble", ""), + "gqMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), + "gqStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), + "gqMeanHet" -> new VCFSignature("Double", "toDouble", ""), + "gqStDevHet" -> new VCFSignature("Double", "toDouble", ""), + "gqMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), + 
"gqStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), + "MAF" -> new VCFSignature("Double", "toDouble", ""), "nNonRef" -> new VCFSignature("Int", "toInt", ""), - "rHeterozygosity" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "rHetHomVar" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), - "rExpectedHetFrequency" -> new VCFSignature("Option[Double]", "toOptionDouble", ""), + "rHeterozygosity" -> new VCFSignature("Double", "toDouble", ""), + "rHetHomVar" -> new VCFSignature("Double", "toDouble", ""), + "rExpectedHetFrequency" -> new VCFSignature("Double", "toDouble", ""), "pHWE" -> new VCFSignature("Double", "toDouble", "")) } @@ -210,33 +210,38 @@ class VariantQCCombiner extends Serializable { val hwe = HWEStats - Map("nCalled" -> (nHomRef + nHet + nHomVar).toString, - "nNotCalled" -> nNotCalled.toString, - "nHomRef" -> nHomRef.toString, - "nHet" -> nHet.toString, - "nHomVar" -> nHomVar.toString, - "dpMean" -> someIf(dpSC.count > 0, dpSC.mean).toString, - "dpStDev" -> someIf(dpSC.count > 0, dpSC.stdev).toString, - "dpMeanHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.mean).toString, - "dpStDevHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.stdev).toString, - "dpMeanHet" -> someIf(dpHetSC.count > 0, dpHetSC.mean).toString, - "dpStDevHet" -> someIf(dpHetSC.count > 0, dpHetSC.stdev).toString, - "dpMeanHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.mean).toString, - "dpStDevHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.stdev).toString, - "gqMean" -> someIf(gqSC.count > 0, gqSC.mean).toString, - "gqStDev" -> someIf(gqSC.count > 0, gqSC.stdev).toString, - "gqMeanHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.mean).toString, - "gqStDevHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.stdev).toString, - "gqMeanHet" -> someIf(gqHetSC.count > 0, gqHetSC.mean).toString, - "gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev).toString, - "gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean).toString, - 
"gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev).toString, - "MAF" -> maf.toString, - "nNonRef" -> (nHet + nHomVar).toString, - "rHeterozygosity" -> divOption(nHet, nHomRef + nHet + nHomVar).toString, - "rHetHomVar" -> divOption(nHet, nHomVar).toString, - "rExpectedHetFrequency" -> hwe._1.toString, - "pHWE" -> hwe._2.toString) + Map("nCalled" -> (nHomRef + nHet + nHomVar), + "nNotCalled" -> nNotCalled, + "nHomRef" -> nHomRef, + "nHet" -> nHet, + "nHomVar" -> nHomVar, + "dpMean" -> someIf(dpSC.count > 0, dpSC.mean), + "dpStDev" -> someIf(dpSC.count > 0, dpSC.stdev), + "dpMeanHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.mean), + "dpStDevHomRef" -> someIf(dpHomRefSC.count > 0, dpHomRefSC.stdev), + "dpMeanHet" -> someIf(dpHetSC.count > 0, dpHetSC.mean), + "dpStDevHet" -> someIf(dpHetSC.count > 0, dpHetSC.stdev), + "dpMeanHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.mean), + "dpStDevHomVar" -> someIf(dpHomVarSC.count > 0, dpHomVarSC.stdev), + "gqMean" -> someIf(gqSC.count > 0, gqSC.mean), + "gqStDev" -> someIf(gqSC.count > 0, gqSC.stdev), + "gqMeanHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.mean), + "gqStDevHomRef" -> someIf(gqHomRefSC.count > 0, gqHomRefSC.stdev), + "gqMeanHet" -> someIf(gqHetSC.count > 0, gqHetSC.mean), + "gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev), + "gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean), + "gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev), + "MAF" -> maf, + "nNonRef" -> (nHet + nHomVar), + "rHeterozygosity" -> divOption(nHet, nHomRef + nHet + nHomVar), + "rHetHomVar" -> divOption(nHet, nHomVar), + "rExpectedHetFrequency" -> hwe._1, + "pHWE" -> hwe._2) + .flatMap { case (k, v) => v match { + case Some(value) => Some(k, value.toString) + case None => None + case _ => Some(k, v.toString) + }} } } diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 21dd572dbfe..5f855f04f06 
100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -15,28 +15,12 @@ class FilterString(val s: String) extends AnyVal { def !~(t: String): Boolean = !this.~(t) } -object ConvertibleString { - val someRegex = """Some\(([0-9\.]+)\)""".r -} - class ConvertibleString(val s: String) extends AnyVal { def toArrayInt: Array[Int] = s.split(",").map(i => i.toInt) def toArrayDouble: Array[Double] = s.split(",").map(i => i.toDouble) def toSetString: Set[String] = s.split(",").toSet - - def toStupidAnnotation: Array[Array[String]] = s.split(",").map(_.split("|").map(_.trim)) - - def toOptionInt: Option[Int] = s match { - case ConvertibleString.someRegex(i) => Some(i.toInt) - case "None" => None - } - - def toOptionDouble: Option[Double] = s match { - case ConvertibleString.someRegex(i) => Some(i.toDouble) - case "None" => None - } } object FilterUtils { @@ -47,10 +31,6 @@ object FilterUtils { implicit def toFilterString(s: String): FilterString = new FilterString(s) implicit def toConvertibleString(s: String): ConvertibleString = new ConvertibleString(s) - - // def test(): (Variant, Annotations[String]) => Boolean = { - // throw new UnsupportedOperationException - // } } class EvaluatorWithTransformation[T, S](t: String, f: T => S)(implicit tct: ClassTag[T]) extends Serializable { @@ -128,23 +108,23 @@ class FilterSampleCondition(cond: String, sas: AnnotationSignatures) class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: AnnotationSignatures, sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String]) extends EvaluatorWithTransformation[FilterGenotypeWithSA, FilterGenotypePostSA]( - s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], - | __ids: IndexedSeq[String]) => { - | import org.broadinstitute.hail.methods.FilterUtils._ - | ${signatures(sas, "__sa")} - | ${makeIndexedSeq("__saArray", "__sa", "__sa")} - | (v: 
org.broadinstitute.hail.variant.Variant, - | __va: org.broadinstitute.hail.annotations.AnnotationData) => { - | ${signatures(vas, "__va")} - | ${instantiate("va", "__va")} - | (__sIndex: Int, - | g: org.broadinstitute.hail.variant.Genotype) => { - | val sa = __saArray(__sIndex) - | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) - | $cond - | }: Boolean - | } - | } + s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], + | __ids: IndexedSeq[String]) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | ${signatures(sas, "__sa")} + | ${makeIndexedSeq("__saArray", "__sa", "__sa")} + | (v: org.broadinstitute.hail.variant.Variant, + | __va: org.broadinstitute.hail.annotations.AnnotationData) => { + | ${signatures(vas, "__va")} + | ${instantiate("va", "__va")} + | (__sIndex: Int, + | g: org.broadinstitute.hail.variant.Genotype) => { + | val sa = __saArray(__sIndex) + | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) + | $cond + | }: Boolean + | } + | } """.stripMargin, t => t(sad, ids)) { def apply(v: Variant, va: AnnotationData)(sIndex: Int, g: Genotype): Boolean = diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index f8328075ebe..8a61996669a 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -3,12 +3,12 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.vcf.BufferedLineIterator import scala.io.Source -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.SparkContext import org.broadinstitute.hail.variant._ import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.vcf import org.broadinstitute.hail.annotations._ -import scala.collection.convert._ +import scala.collection.JavaConversions._ object LoadVCF { // FIXME move to VariantDataset @@ -30,18 +30,20 @@ object 
LoadVCF { } val codec = new htsjdk.variant.vcf.VCFCodec() + val header = codec.readHeader(new BufferedLineIterator(headerLines.iterator.buffered)) + .getHeaderValue .asInstanceOf[htsjdk.variant.vcf.VCFHeader] - val contigs = header.getContigLines. + // FIXME: use htsjdk to parse contigs when they expose the correct fields val contigRegex ="""##contig=""".r - val contigLengths = { + val contigs = { val contigMap = headerLines.map { case contigRegex(id, length) => Some((id, length.toInt)) case _ => None }.flatMap(i => i) - .toMap + .toMap if (contigMap.nonEmpty) contigMap @@ -49,24 +51,18 @@ object LoadVCF { null } - val annoRegex = """##INFO=""".r - val annotationTypes = { - val annotationMap = headerLines.flatMap { - case annoRegex(id, number, typeOf, desc) => Some(id, VCFSignature.parse(number, typeOf, desc)) - case _ => None - }.toMap + val infoSignatures = header + .getInfoHeaderLines + .toList + .map(line => (line.getID, VCFSignature.parse(line))) + .toMap - if (annotationMap.nonEmpty) - annotationMap - else - Map.empty[String, AnnotationSignature] - } - val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> annotationTypes), - Map("filters" -> new VCFSignature("Set[String]", "toSetString", "filters applied to site"), + val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> infoSignatures), + Map("filters" -> new VCFSignature("Set[String]","toSetString", "filters applied to site"), "pass" -> new VCFSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), "multiallelic" -> new VCFSignature("Boolean", "toBoolean", "Site is a split multiallelic"), "qual" -> new VCFSignature("Double", "toDouble", "vcf qual field"), - "rsid" -> new VCFSignature("String", "toString", "site rdID"))) + "rsid" -> new VCFSignature("String", "toString", "site rsID"))) val headerLine = headerLines.last assert(headerLine(0) == '#' && headerLine(1) != '#') @@ -92,7 +88,7 @@ object 
LoadVCF { } } - VariantSampleMatrix(VariantMetadata(contigLengths, sampleIds, + VariantSampleMatrix(VariantMetadata(contigs, sampleIds, headerLines, sampleAnnotations, sampleAnnotationSignatures, annotationSignatures), genotypes) } } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala index c1320e4e08d..aba091a53c7 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala @@ -216,6 +216,7 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, val zeroArray = new Array[Byte](zeroBuffer.limit) zeroBuffer.get(zeroArray) + rdd .map { case (v, va, gs) => val serializer = SparkEnv.get.serializer.newInstance() @@ -255,7 +256,6 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, rdd.map { case (v, va, gs) => (v, gs.foldLeft(zeroValue)((acc, g) => combOp(acc, g))) } def same(that: VariantSampleMatrix[T]): Boolean = { - println(metadata == that.metadata) metadata == that.metadata && localSamples.sameElements(that.localSamples) && rdd.map { case (v, va, gs) => (v, (va, gs)) } diff --git a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala index 97ef316f889..5f044407fd7 100644 --- a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala @@ -4,7 +4,6 @@ import org.broadinstitute.hail.SparkSuite import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.driver._ import org.broadinstitute.hail.variant.{Genotype, IntervalList, Variant} -import org.scalacheck.Gen import org.testng.annotations.Test import org.broadinstitute.hail.methods._ import org.broadinstitute.hail.methods.FilterUtils.toConvertibleString @@ -34,86 +33,86 @@ class 
AnnotationsSuite extends SparkSuite { assert(variantAnnotationMap.contains(anotherVariant)) // type Int - INFO.DP - assert(vas.get("info", "DP").contains(VCFSignature("Int", "toInt", + assert(vas.getInMap("info", "DP").contains(VCFSignature("Integer", "Int", "1", "toInt", "Approximate read depth; some reads may have been filtered"))) assert(variantAnnotationMap(firstVariant) - .get("info", "DP") + .getInMap("info", "DP") .contains("77560") && variantAnnotationMap(firstVariant) - .get("info", "DP").get.toInt == 77560) + .getInMap("info", "DP").get.toInt == 77560) assert(variantAnnotationMap(anotherVariant) - .get("info", "DP") + .getInMap("info", "DP") .contains("20271") && variantAnnotationMap(anotherVariant) - .get("info", "DP").get.toInt == 20271) + .getInMap("info", "DP").get.toInt == 20271) // type Double - INFO.HWP - assert(vas.get("info", "HWP").contains(VCFSignature("Double", "toDouble", + assert(vas.getInMap("info", "HWP").contains(new VCFSignature("Float", "Double", "1", "toDouble", "P value from test of Hardy Weinberg Equilibrium"))) assert(variantAnnotationMap(firstVariant) - .contains("info", "HWP") && + .containsInMap("info", "HWP") && D_==(variantAnnotationMap(firstVariant) - .get("info", "HWP").get.toDouble, 0.0001)) + .getInMap("info", "HWP").get.toDouble, 0.0001)) assert(variantAnnotationMap(anotherVariant) - .contains("info", "HWP") && + .containsInMap("info", "HWP") && D_==(variantAnnotationMap(anotherVariant) - .get("info", "HWP").get.toDouble, 0.8286)) + .getInMap("info", "HWP").get.toDouble, 0.8286)) // type String - INFO.culprit - assert(vas.get("info", "culprit").contains(VCFSignature("String", "toString", + assert(vas.getInMap("info", "culprit").contains(VCFSignature("String", "String", "1", "toString", "The annotation which was the worst performing in the Gaussian mixture model, " + "likely the reason why the variant was filtered out"))) assert(variantAnnotationMap(firstVariant) - .get("info", "culprit") + .getInMap("info", "culprit") 
.contains("FS")) assert(variantAnnotationMap(anotherVariant) - .get("info", "culprit") + .getInMap("info", "culprit") .contains("FS")) // type Array - INFO.AC (allele count) - assert(vas.get("info", "AC").contains(VCFSignature("Array[Int]", "toArrayInt", + assert(vas.getInMap("info", "AC").contains(VCFSignature("Integer", "Array[Int]", "A", "toArrayInt", "Allele count in genotypes, for each ALT allele, in the same order as listed"))) assert(variantAnnotationMap(firstVariant) - .get("info", "AC") + .getInMap("info", "AC") .contains("89") && variantAnnotationMap(firstVariant) - .get("info", "AC").get.toArrayInt + .getInMap("info", "AC").get.toArrayInt .sameElements(Array(89))) assert(variantAnnotationMap(anotherVariant) - .get("info", "AC") + .getInMap("info", "AC") .contains("13") && variantAnnotationMap(anotherVariant) - .get("info", "AC").get.toArrayInt + .getInMap("info", "AC").get.toArrayInt .sameElements(Array(13))) // type Boolean/flag - INFO.DB (dbSNP membership) - assert(vas.get("info", "DB").contains(VCFSignature("Boolean", "toBoolean", + assert(vas.getInMap("info", "DB").contains(new VCFSignature("Flag", "Boolean", "0", "toBoolean", "dbSNP Membership"))) assert(variantAnnotationMap(firstVariant) - .get("info", "DB") + .getInMap("info", "DB") .contains("true") && variantAnnotationMap(firstVariant) - .get("info", "DB").get.toBoolean) // .get.toBoolean == true + .getInMap("info", "DB").get.toBoolean) // .get.toBoolean == true assert(!variantAnnotationMap(anotherVariant) - .contains("info", "DB")) + .containsInMap("info", "DB")) //type Set[String] - assert(vas.get("filters").contains(VCFSignature("Set[String]", "toSetString", "filters applied to site"))) + assert(vas.getVal("filters").contains(new VCFSignature("Set[String]", "toSetString", "filters applied to site"))) assert(variantAnnotationMap(firstVariant) - .get("filters").contains("PASS") && + .getVal("filters").contains("PASS") && variantAnnotationMap(firstVariant) - .get("filters").get.toSetString == 
Set[String]("PASS")) + .getVal("filters").get.toSetString == Set[String]("PASS")) assert(variantAnnotationMap(anotherVariant) - .get("filters").contains("VQSRTrancheSNP99.95to100.00") && + .getVal("filters").contains("VQSRTrancheSNP99.95to100.00") && variantAnnotationMap(anotherVariant) - .get("filters").get.toSetString == Set[String]("VQSRTrancheSNP99.95to100.00")) + .getVal("filters").get.toSetString == Set[String]("VQSRTrancheSNP99.95to100.00")) // GATK PASS - assert(vas.get("pass").contains(VCFSignature("Boolean", "toBoolean", + assert(vas.getVal("pass").contains(new VCFSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"))) assert(variantAnnotationMap(firstVariant) - .get("pass").contains("true")) + .getVal("pass").contains("true")) assert(variantAnnotationMap(anotherVariant) - .get("pass").contains("false")) + .getVal("pass").contains("false")) } } diff --git a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala index 4805ae4a1c0..87b2a248bc7 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala @@ -23,10 +23,12 @@ class FilterSuite extends SparkSuite { // the below command will test typing of runtime-generated code exposing annotations FilterGenotypes.run(state, Array("--keep", "-c", - """assert(va.pass.getClass.getName == "boolean");""" + - """assert(va.info.AN.getClass.getName == "int");""" + - """assert(va.info.GQ_MEAN.getClass.getName == "double");""" + - """assert(va.info.AC.getClass.getName == "int[]");""" + - """assert(va.filters.getClass.getName.contains("scala.collection.immutable.Set"));true""")) + """assert(va.pass.forall(_.getClass.getName == "boolean"), "va.pass was not a boolean") + |assert(va.info.AN.forall(_.getClass.getName == "int"), "AN was not an int") + |assert(va.info.GQ_MEAN.forall(_.getClass.getName == "double"), "GQ_MEAN was not a 
double") + |assert(va.info.AC.forall(_.getClass.getSimpleName == "int[]"), "AC was not an int array") + |assert(va.filters.forall(_.getClass.getName.contains("scala.collection.immutable.Set")), + | "filters was not a set") + |true""".stripMargin)).vds.expand().collect() } } From d4a3acdcfe475ec022e7053aea4507432c744386 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Mon, 21 Dec 2015 11:12:43 -0500 Subject: [PATCH 11/15] Failing illegal cyclic reference involving object InterfaceAudience Works when toTSVString is inside UserExportUtils Fails when inside Utils, even if you just import that static method --- .../scala/org/broadinstitute/hail/Utils.scala | 50 +++++--------- .../hail/annotations/Annotations.scala | 15 ++-- .../hail/driver/ExportGenotypes.scala | 6 +- .../hail/driver/ExportSamples.scala | 6 +- .../hail/driver/ExportVariants.scala | 6 +- .../broadinstitute/hail/driver/SampleQC.scala | 2 +- .../hail/driver/VariantQC.scala | 4 +- .../hail/methods/Evaluator.scala | 61 ++++++++++++++++ .../hail/methods/ExportTSV.scala | 69 +++++++++++-------- .../broadinstitute/hail/methods/Filter.scala | 48 ------------- .../broadinstitute/hail/methods/LoadVCF.scala | 5 ++ .../hail/methods/ExportSuite.scala | 10 ++- 12 files changed, 145 insertions(+), 137 deletions(-) create mode 100644 src/main/scala/org/broadinstitute/hail/methods/Evaluator.scala diff --git a/src/main/scala/org/broadinstitute/hail/Utils.scala b/src/main/scala/org/broadinstitute/hail/Utils.scala index e4b2fdcfa64..907981a55a0 100644 --- a/src/main/scala/org/broadinstitute/hail/Utils.scala +++ b/src/main/scala/org/broadinstitute/hail/Utils.scala @@ -15,8 +15,7 @@ import breeze.linalg.{Vector => BVector, DenseVector => BDenseVector, SparseVect import org.apache.spark.mllib.linalg.{Vector => SVector, DenseVector => SDenseVector, SparseVector => SSparseVector} import scala.reflect.ClassTag import org.broadinstitute.hail.Utils._ -import scala.reflect.runtime.currentMirror -import scala.tools.reflect.ToolBox + // 
FIXME AnyVal in Scala 2.11 class RichVector[T](v: Vector[T]) { @@ -209,7 +208,9 @@ class RichRDD[T](val r: RDD[T]) extends AnyVal { def writeTable(filename: String, header: String = null) { if (header != null) - writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) {_.write(header)} + writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) { + _.write(header) + } hadoopDelete(filename, r.sparkContext.hadoopConfiguration, recursive = true) r.saveAsTextFile(filename) } @@ -238,12 +239,9 @@ class RichOption[T](val o: Option[T]) extends AnyVal { } class RichStringBuilder(val sb: mutable.StringBuilder) extends AnyVal { - def tsvAppend[T](v: Option[T]) { - v match { - case Some(d: Double) => sb.append(stringFormatDouble(d)) - case Some(x) => sb.append(x) - case None => sb.append("NA") - } + def tsvAppend(a: Any) { +// sb.append(org.broadinstitute.hail.methods.UserExportUtils.toTSVString(a)) + sb.append(org.broadinstitute.hail.Utils.toTSVString(a)) } } @@ -428,16 +426,6 @@ object Utils { if (!p) throw new AssertionError } - def stringFormatDouble(d: Double): String = { - d.formatted("%.4e") - } - - def writeOption(o: Option[Any], missingValue: String = "NA"): String = o match { - case Some(d: Double) => stringFormatDouble(d) - case Some(x) => x.toString - case None => missingValue - } - // FIXME Would be nice to have a version that averages three runs, perhaps even discarding an initial run. In this case the code block had better be functional! 
def printTime[T](block: => T) = { val timed = time(block) @@ -475,11 +463,6 @@ object Utils { } } - def toTSVString(a: Any): String = a match { - case o: Option[Any] => o.map(toTSVString).getOrElse("NA") - case _ => a.toString - } - def someIf[T](p: Boolean, x: => T): Option[T] = if (p) Some(x) @@ -516,14 +499,6 @@ object Utils { def flushDouble(a: Double): Double = if (math.abs(a) < java.lang.Double.MIN_NORMAL) 0.0 else a - - def eval[T](t: String): T = { - val toolbox = currentMirror.mkToolBox() - val ast = toolbox.parse(t) - toolbox.typeCheck(ast) - toolbox.eval(ast).asInstanceOf[T] - } - def genOption[T](g: Gen[T], someFrequency: Int = 4): Gen[Option[T]] = Gen.frequency((1, Gen.const(None)), (someFrequency, g.map(Some(_)))) @@ -536,4 +511,15 @@ object Utils { implicit def richIterator[T](it: Iterator[T]): RichIterator[T] = new RichIterator[T](it) + + def toTSVString(a: Any): String = { + a match { + // case Some(o) => toTSVString(o) + case None => "NA" + case d: Double => d.formatted("%.4e") + // case i: Iterable[_] => i.map(toTSVString).mkString(",") + // case arr: Array[_] => arr.map(toTSVString).mkString(",") + case _ => a.toString + } + } } diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index 6921b7454e7..0655cd9f152 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -34,19 +34,16 @@ case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T object Annotations { - def emptyOf[T](): Annotations[T] = { + def empty[T](): Annotations[T] = Annotations(Map.empty[String, Map[String, T]], Map.empty[String, T]) - } - def emptyOfSignature(): AnnotationSignatures = - Annotations(Map.empty[String, Map[String, AnnotationSignature]], Map.empty[String, AnnotationSignature]) + def emptyOfSignature(): AnnotationSignatures = 
empty[AnnotationSignature]() - def emptyOfString(): AnnotationData = - Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String]) + def emptyOfString(): AnnotationData = empty[String]() def emptyOfArrayString(nSamples: Int): IndexedSeq[AnnotationData] = (0 until nSamples) - .map(i => Annotations(Map.empty[String, Map[String, String]], Map.empty[String, String])) + .map(i => empty[String]()) } object AnnotationUtils { @@ -62,7 +59,7 @@ object AnnotationUtils { object AnnotationClassBuilder { def signatures(sigs: AnnotationSignatures, hiddenClassName: String, - makeToString: Boolean = false, missing: String = ""): String = { + makeToString: Boolean = false): String = { val internalClasses = sigs.maps.map { case (subclass, subMap) => val attrs = subMap @@ -73,7 +70,7 @@ object AnnotationClassBuilder { val methods: String = { if (makeToString) { s""" def __fields: Array[String] = Array( - | ${subMap.keys.toArray.sorted.map(s => s"""formatString($s, "$missing")""").mkString(",")} + | ${subMap.keys.toArray.sorted.map(s => s"""toTSVString($s)""").mkString(",")} | ) | override def toString: String = __fields.mkString(";") | def all: String = __fields.mkString("\t")""".stripMargin diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala index 33e31cfa121..ecc2b4b7528 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala @@ -17,10 +17,6 @@ object ExportGenotypes extends Command { @Args4jOption(required = true, name = "-c", aliases = Array("--condition"), usage = "Comma-separated list of fields to be printed to tsv") var condition: String = _ - - @Args4jOption(required = false, name = "--missing", - usage = "Format of missing values (Default: 'NA')") - var missing = "NA" } def newOptions = new Options @@ -41,7 +37,7 @@ object ExportGenotypes extends Command { val 
makeString: ((Variant, AnnotationData) => ((Int, Genotype) => String)) = { - val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, ids, options.missing) + val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, ids) cf.typeCheck() cf.apply } diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala index ad2e67ec83f..1153ce9dcde 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala @@ -17,10 +17,6 @@ object ExportSamples extends Command { @Args4jOption(required = true, name = "-c", aliases = Array("--condition"), usage = "Comma-separated list of fields to be printed to tsv") var condition: String = _ - - @Args4jOption(required = false, name = "--missing", - usage = "Format of missing values (Default: 'NA')") - var missing = "NA" } def newOptions = new Options @@ -39,7 +35,7 @@ object ExportSamples extends Command { val sas = vds.metadata.sampleAnnotationSignatures val makeString: (Sample, Annotations[String]) => String = { try { - val ese = new ExportSamplesEvaluator(cond, sas, options.missing) + val ese = new ExportSamplesEvaluator(cond, sas) ese.typeCheck() ese.apply } catch { diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala index 204fb32bd59..e18d80a6a7e 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala @@ -17,10 +17,6 @@ object ExportVariants extends Command { @Args4jOption(required = true, name = "-c", aliases = Array("--condition"), usage = "Comma-separated list of fields to be printed to tsv") var condition: String = _ - - @Args4jOption(required = false, name = "--missing", - usage = "Format of missing values (Default: 'NA')") - var missing = "NA" 
} def newOptions = new Options @@ -39,7 +35,7 @@ object ExportVariants extends Command { val vas = vds.metadata.variantAnnotationSignatures val makeString: (Variant, Annotations[String]) => String = { try { - val eve = new ExportVariantsEvaluator(cond, vas, options.missing) + val eve = new ExportVariantsEvaluator(cond, vas) eve.typeCheck() eve.apply } catch { diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index c59aadc4b53..3ccaeca70cc 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -254,7 +254,7 @@ class SampleQCCombiner extends Serializable { } def asMap: Map[String, String] = { - Map("nCalled" -> (nHomRef + nHet + nHomVar), + Map[String, Any]("nCalled" -> (nHomRef + nHet + nHomVar), "nNotCalled" -> nNotCalled, "nHomRef" -> nHomRef, "nHet" -> nHet, diff --git a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala index 32380ff62be..abade95d776 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala @@ -199,7 +199,7 @@ class VariantQCCombiner extends Serializable { val hwe = HWEStats sb.tsvAppend(hwe._1) sb += '\t' - sb ++= stringFormatDouble(hwe._2) + sb.tsvAppend(hwe._2) } def asMap: Map[String, String] = { @@ -210,7 +210,7 @@ class VariantQCCombiner extends Serializable { val hwe = HWEStats - Map("nCalled" -> (nHomRef + nHet + nHomVar), + Map[String, Any]("nCalled" -> (nHomRef + nHet + nHomVar), "nNotCalled" -> nNotCalled, "nHomRef" -> nHomRef, "nHet" -> nHet, diff --git a/src/main/scala/org/broadinstitute/hail/methods/Evaluator.scala b/src/main/scala/org/broadinstitute/hail/methods/Evaluator.scala new file mode 100644 index 00000000000..7b531c8f750 --- /dev/null +++ 
b/src/main/scala/org/broadinstitute/hail/methods/Evaluator.scala @@ -0,0 +1,61 @@ +package org.broadinstitute.hail.methods + +import java.io.Serializable +import scala.reflect.ClassTag + + +class EvaluatorWithTransformation[T, S](t: String, f: T => S)(implicit tct: ClassTag[T]) extends Serializable { + @transient var p: Option[S] = None + + def typeCheck() { + require(p.isEmpty) + p = Some(f(Evaluator.eval[T](t))) + } + + def eval(): S = p match { + case null | None => + val v = f(Evaluator.eval[T](t)) + p = Some(v) + v + case Some(v) => v + } +} + +class Evaluator[T](t: String)(implicit tct: ClassTag[T]) + extends Serializable { + @transient var p: Option[T] = None + + def typeCheck() { + require(p.isEmpty) + try { + p = Some(Evaluator.eval[T](t)) + } + catch { + case e: scala.tools.reflect.ToolBoxError => + /* e.message looks like: + reflective compilation has failed: + + ';' expected but '.' found. */ + org.broadinstitute.hail.Utils.fatal("parse error in condition: " + e.message.split("\n").last) + } + } + + def eval(): T = p match { + case null | None => + val v = Evaluator.eval[T](t) + p = Some(v) + v + case Some(v) => v + } +} + +object Evaluator { + import scala.reflect.runtime.currentMirror + import scala.tools.reflect.ToolBox + def eval[T](t: String): T = { + val toolbox = currentMirror.mkToolBox() + val ast = toolbox.parse(t) + toolbox.typeCheck(ast) + toolbox.eval(ast).asInstanceOf[T] + } +} \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index 8076ba7fb63..904d15e7c72 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -2,42 +2,43 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.annotations._ -import 
org.broadinstitute.hail.methods.ExportUtils.{ExportGenotypePostSA, ExportGenotypeWithSA} import org.broadinstitute.hail.variant.{Sample, Variant, Genotype} -import org.broadinstitute.hail.Utils._ import scala.language.implicitConversions -object ExportUtils { - type ExportGenotypeWithSA = ((IndexedSeq[AnnotationData], IndexedSeq[String]) => - ((Variant, AnnotationData) => ((Int, Genotype) => String))) - type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Genotype) => String) -} - object UserExportUtils { - def formatString(a: Any, missingValue: String): String = a match { - case o: Option[Any] => writeOption(o, missingValue) - case d: Double => stringFormatDouble(d) - case i: Iterable[Any] => if (i.isEmpty) "" else i.map(formatString(_, missingValue)).mkString(",") - case _ => a.toString - } - class ExportVariant(val v: Variant) extends AnyVal { def contig = v.contig + def start = v.start + def ref = v.ref + def alt = v.alt + def variantType = v.variantType + def inParX = v.inParX + def inParY = v.inParY + def isSNP = v.isSNP + def isMNP = v.isMNP + def isInsertion = v.isInsertion + def isDeletion = v.isDeletion + def isIndel = v.isIndel - def isCopmlex = v.isComplex + + def isComplex = v.isComplex + def isTransition = v.isTransition + def isTransversion = v.isTransversion + def nMismatch = v.nMismatch + override def toString: String = { s"${contig}_${start}_${ref}_$alt" } @@ -45,43 +46,54 @@ object UserExportUtils { } -class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures, missingValue: String) +class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures) extends Evaluator[(Variant, AnnotationData) => String]({ s"""(__v: org.broadinstitute.hail.variant.Variant, | __va: org.broadinstitute.hail.annotations.AnnotationData) => { | import org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ + | + | | val v: ExportVariant = new ExportVariant(__v) - | ${signatures(vas, "__va", 
makeToString = true, missing = missingValue)} + | ${signatures(vas, "__va", makeToString = true)} | ${instantiate("va", "__va")} - | Array($list).map(formatString(_, "$missingValue")).mkString("\t") + | Array($list).map(toTSVString).mkString("\t") |}: String """.stripMargin}) { def apply(v: Variant, va: AnnotationData): String = eval()(v, va) } -class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures, missingValue: String) +class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures) extends Evaluator[(Sample, AnnotationData) => String]( - s"""(s: org.broadinstitute.hail.variant.Sample, + {val s = s"""(s: org.broadinstitute.hail.variant.Sample, | __sa: org.broadinstitute.hail.annotations.AnnotationData) => { | import org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ - | ${signatures(sas, "__sa", makeToString = true, missing = missingValue)} + | import org.broadinstitute.hail.Utils.toTSVString + | ${signatures(sas, "__sa", makeToString = true)} | ${instantiate("sa", "__sa")} - | Array($list).map(formatString(_, "$missingValue")).mkString("\t") + | Array($list).map(toTSVString).mkString("\t") |}: String - """.stripMargin) { + """.stripMargin;println(s);s}) { def apply(s: Sample, sa: AnnotationData): String = eval()(s, sa) } +object ExportGenotypeEvaluator { + type ExportGenotypeWithSA = ((IndexedSeq[AnnotationData], IndexedSeq[String]) => + ((Variant, AnnotationData) => ((Int, Genotype) => String))) + type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Genotype) => String) +} + class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: AnnotationSignatures, - sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String], missingValue: String) - extends EvaluatorWithTransformation[ExportGenotypeWithSA, ExportGenotypePostSA]( + sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String]) + extends EvaluatorWithTransformation[ExportGenotypeEvaluator.ExportGenotypeWithSA, + 
ExportGenotypeEvaluator.ExportGenotypePostSA]( s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], | __ids: IndexedSeq[String]) => { | import org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ - | ${signatures(sas, "__sa")} + | import org.broadinstitute.hail.Utils.toTSVString + | ${signatures(sas, "__sa", makeToString = true)} | ${makeIndexedSeq("__saArray", "__sa", "__sa")} | (__v: org.broadinstitute.hail.variant.Variant, | __va: org.broadinstitute.hail.annotations.AnnotationData) => { @@ -92,12 +104,13 @@ class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: Anno | g: org.broadinstitute.hail.variant.Genotype) => { | val sa = __saArray(__sIndex) | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) - | Array($list).map(formatString(_, "$missingValue")).mkString("\t") + | Array($list).map(toTSVString).mkString("\t") | }: String | } | } """.stripMargin, t => t(sad, ids)) { + def apply(v: Variant, va: AnnotationData)(sIndex: Int, g: Genotype): String = eval()(v, va)(sIndex, g) } \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 5f855f04f06..849651a4590 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -1,12 +1,9 @@ package org.broadinstitute.hail.methods -import org.broadinstitute.hail.Utils -import org.broadinstitute.hail.Utils._ import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.methods.FilterUtils.{FilterGenotypePostSA, FilterGenotypeWithSA} import org.broadinstitute.hail.variant._ -import scala.reflect.ClassTag import scala.language.implicitConversions class FilterString(val s: String) extends AnyVal { @@ -33,51 +30,6 @@ object FilterUtils { implicit def 
toConvertibleString(s: String): ConvertibleString = new ConvertibleString(s) } -class EvaluatorWithTransformation[T, S](t: String, f: T => S)(implicit tct: ClassTag[T]) extends Serializable { - @transient var p: Option[S] = None - - def typeCheck() { - require(p.isEmpty) - p = Some(f(Utils.eval[T](t))) - } - - def eval(): S = p match { - case null | None => - val v = f(Utils.eval[T](t)) - p = Some(v) - v - case Some(v) => v - } -} - -class Evaluator[T](t: String)(implicit tct: ClassTag[T]) - extends Serializable { - @transient var p: Option[T] = None - - def typeCheck() { - require(p.isEmpty) - try { - p = Some(Utils.eval[T](t)) - } - catch { - case e: scala.tools.reflect.ToolBoxError => - /* e.message looks like: - reflective compilation has failed: - - ';' expected but '.' found. */ - fatal("parse error in condition: " + e.message.split("\n").last) - } - } - - def eval(): T = p match { - case null | None => - val v = Utils.eval[T](t) - p = Some(v) - v - case Some(v) => v - } -} - class FilterVariantCondition(cond: String, vas: AnnotationSignatures) extends Evaluator[(Variant, AnnotationData) => Boolean]({ s"""(v: org.broadinstitute.hail.variant.Variant, diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index 8a61996669a..514f7689fae 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -51,6 +51,11 @@ object LoadVCF { null } + val filters = header + .getFilterLines + .toList +// .map(line => line.g) + val infoSignatures = header .getInfoHeaderLines .toList diff --git a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala index 575fc6a6b48..3db80311581 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala @@ -1,9 +1,11 
@@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.SparkSuite +import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.driver._ +//import org.broadinstitute.hail.methods.UserExportUtils.toTSVString +import org.broadinstitute.hail.variant.Sample import org.testng.annotations.Test - import scala.io.Source /** @@ -21,6 +23,11 @@ class ExportSuite extends SparkSuite{ SampleQC.run(state, Array("-o", "/tmp/sampleQC")) val postSampleQC = SampleQC.run(state, Array("--store")) +// println(toTSVString(Some(5.1))) +// println(toTSVString(Some(None))) +// println(toTSVString(Array(1,2,3,4,5))) +// println(toTSVString(5.124)) + ExportSamples.run(postSampleQC, Array("-o", "/tmp/exportSamples", "-c", "s.id, sa.qc.nCalled,sa.qc.nNotCalled,sa.qc.nHomRef,sa.qc.nHet,sa.qc.nHomVar,sa.qc.nSNP,sa.qc.nInsertion," + "sa.qc.nDeletion,sa.qc.nSingleton,sa.qc.nTransition,sa.qc.nTransversion,sa.qc.dpMean,sa.qc.dpStDev," + @@ -37,7 +44,6 @@ class ExportSuite extends SparkSuite{ assert(sQcOutput == sExportOutput) VariantQC.run(state, Array("-o", "/tmp/variantQC")) - val postVariantQC = VariantQC.run(state, Array("--store")) ExportVariants.run(postVariantQC, Array("-o", "/tmp/exportVariants", "-c", From 866b4358868c01ab7be4d96d8ac8e6c0e7827f7f Mon Sep 17 00:00:00 2001 From: tpoterba Date: Mon, 21 Dec 2015 12:18:53 -0500 Subject: [PATCH 12/15] Third round of cseed changes implemented --- .../scala/org/broadinstitute/hail/Utils.scala | 15 +--- .../annotations/AnnotationSignature.scala | 8 ++ .../hail/annotations/VCFSignature.scala | 7 +- .../hail/driver/ExportGenotypes.scala | 4 +- .../hail/driver/FilterGenotypes.scala | 10 +-- .../hail/driver/FilterSamples.scala | 9 +-- .../broadinstitute/hail/driver/SampleQC.scala | 69 +++++++++--------- .../hail/driver/VariantQC.scala | 56 +++++++------- .../broadinstitute/hail/driver/Write.scala | 5 +- .../hail/methods/ExportTSV.scala | 73 +++++++++++-------- .../broadinstitute/hail/methods/Filter.scala | 13 ++-- 
.../broadinstitute/hail/methods/LoadVCF.scala | 39 +++------- .../hail/variant/VariantMetadata.scala | 22 +++--- .../hail/variant/VariantSampleMatrix.scala | 28 ++++--- .../hail/annotations/AnnotationsSuite.scala | 6 +- .../hail/methods/ExportSuite.scala | 12 +-- .../hail/utils/TestRDDBuilder.scala | 2 +- .../hail/variant/vsm/VSMSuite.scala | 13 ++-- 18 files changed, 190 insertions(+), 201 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/Utils.scala b/src/main/scala/org/broadinstitute/hail/Utils.scala index 907981a55a0..2022c8649a1 100644 --- a/src/main/scala/org/broadinstitute/hail/Utils.scala +++ b/src/main/scala/org/broadinstitute/hail/Utils.scala @@ -240,8 +240,7 @@ class RichOption[T](val o: Option[T]) extends AnyVal { class RichStringBuilder(val sb: mutable.StringBuilder) extends AnyVal { def tsvAppend(a: Any) { -// sb.append(org.broadinstitute.hail.methods.UserExportUtils.toTSVString(a)) - sb.append(org.broadinstitute.hail.Utils.toTSVString(a)) + sb.append(org.broadinstitute.hail.methods.UserExportUtils.toTSVString(a)) } } @@ -510,16 +509,4 @@ object Utils { def genDNAString: Gen[String] = Gen.buildableOf[String, Char](genBase) implicit def richIterator[T](it: Iterator[T]): RichIterator[T] = new RichIterator[T](it) - - - def toTSVString(a: Any): String = { - a match { - // case Some(o) => toTSVString(o) - case None => "NA" - case d: Double => d.formatted("%.4e") - // case i: Iterable[_] => i.map(toTSVString).mkString(",") - // case arr: Array[_] => arr.map(toTSVString).mkString(",") - case _ => a.toString - } - } } diff --git a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala index 3eff7ff2fbd..bc4b52132fd 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala @@ -6,3 +6,11 @@ abstract class AnnotationSignature { def 
emitType: String } + +case class SimpleSignature(emitType: String, emitConversionIdentifier: String, desc: String) extends AnnotationSignature { + + def this(emitType: String, emitConversionIdentifier: String) = this(emitType, emitConversionIdentifier, "") + + def emitUtilities = "" + +} \ No newline at end of file diff --git a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala index 328badc018c..e097d112a7a 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala @@ -6,9 +6,6 @@ case class VCFSignature(vcfType: String, emitType: String, number: String, emitConversionIdentifier: String, description: String) extends AnnotationSignature { - def this(scalaType: String, conversionMethod: String, desc: String) = - this("", scalaType, "", conversionMethod, "") - def emitUtilities: String = "" } @@ -17,7 +14,7 @@ object VCFSignature { val arrayRegex = """Array\[(\w+)\]""".r val setRegex = """Set\[(\w+)\]""".r - def getConversionMethod(str: String): String = { + def parseConversionIdentifier(str: String): String = { str match { case arrayRegex(subType) => s"toArray$subType" case setRegex(subType) => s"toSet$subType" @@ -59,7 +56,7 @@ object VCFSignature { case integerRegex(i) => if (i.toInt > 1) s"Array[$parsedType]" else parsedType case _ => parsedType } - val conversionMethod = getConversionMethod(scalaType) + val conversionMethod = parseConversionIdentifier(scalaType) val desc = line.getDescription diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala index ecc2b4b7528..0ea8eae465f 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala @@ -37,7 +37,7 @@ object ExportGenotypes extends 
Command { val makeString: ((Variant, AnnotationData) => ((Int, Genotype) => String)) = { - val cf = new ExportGenotypeEvaluator(options.condition, vas, sas, sa, ids) + val cf = new ExportGenotypeEvaluator(options.condition, vds.metadata) cf.typeCheck() cf.apply } @@ -47,7 +47,7 @@ object ExportGenotypes extends Command { (s: Int, g: Genotype) => makeString(v, va)(s, g)) - // FIXME add additional command parsing functionality + // FIXME add additional command parsing functionality. Somewhat hacky val variantRegex = """v\.(\w+)""".r val sampleRegex = """s\.(\w+)""".r diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala index f322f33edb4..158bedf23cc 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala @@ -28,16 +28,16 @@ object FilterGenotypes extends Command { def run(state: State, options: Options): State = { val vds = state.vds - val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures - val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures - val ids = vds.sampleIds - val sa = vds.metadata.sampleAnnotations +// val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures +// val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures +// val ids = vds.sampleIds +// val sa = vds.metadata.sampleAnnotations if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") val p: ((Variant, AnnotationData) => ((Int, Genotype) => Boolean)) = { - val cf = new FilterGenotypeCondition(options.condition, vas, sas, sa, ids) + val cf = new FilterGenotypeCondition(options.condition, vds.metadata) cf.typeCheck() cf.apply } diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala index 
3ea5feefc82..3e64a432a47 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala @@ -30,12 +30,11 @@ object FilterSamples extends Command { def run(state: State, options: Options): State = { val vds = state.vds - val sas: AnnotationSignatures = state.vds.metadata.sampleAnnotationSignatures if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") - val indexOfSample: Map[String, Int] = state.vds.sampleIds.zipWithIndex.toMap + val indexOfSample: Map[String, Int] = vds.sampleIds.zipWithIndex.toMap val p = options.condition match { case f if f.endsWith(".sample_list") => @@ -46,11 +45,11 @@ object FilterSamples extends Command { .toSet (s: Int, sa: AnnotationData) => samples.contains(s) case c: String => - val cf = new FilterSampleCondition(c, sas) + val cf = new FilterSampleCondition(c, vds.metadata.sampleAnnotationSignatures) cf.typeCheck() - val sampleIdsBc = state.sc.broadcast(state.vds.sampleIds) - (s: Int, sa: AnnotationData) => cf(Sample(sampleIdsBc.value(s)), state.vds.metadata.sampleAnnotations(s)) + val sampleIdsBc = state.sc.broadcast(vds.sampleIds) + (s: Int, sa: AnnotationData) => cf(Sample(sampleIdsBc.value(s)), vds.metadata.sampleAnnotations(s)) } val newVDS = vds.filterSamples(if (options.keep) diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index 3ccaeca70cc..e14b50ddc51 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -36,37 +36,37 @@ object SampleQCCombiner { "rHetHomVar\t" + "rDeletionInsertion" - val signatures = Map("nCalled" -> new VCFSignature("Int", "toInt", ""), - "nNotCalled" -> new VCFSignature("Int", "toInt", ""), - "nHomRef" -> new VCFSignature("Int", "toInt", ""), - "nHet" -> new VCFSignature("Int", "toInt", ""), - "nHomVar" -> 
new VCFSignature("Int", "toInt", ""), - "nSNP" -> new VCFSignature("Int", "toInt", ""), - "nInsertion" -> new VCFSignature("Int", "toInt", ""), - "nDeletion" -> new VCFSignature("Int", "toInt", ""), - "nSingleton" -> new VCFSignature("Int", "toInt", ""), - "nTransition" -> new VCFSignature("Int", "toInt", ""), - "nTransversion" -> new VCFSignature("Int", "toInt", ""), - "dpMean" -> new VCFSignature("Double", "toDouble", ""), - "dpStDev" -> new VCFSignature("Double", "toDouble", ""), - "dpMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), - "dpStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), - "dpMeanHet" -> new VCFSignature("Double", "toDouble", ""), - "dpStDevHet" -> new VCFSignature("Double", "toDouble", ""), - "dpMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), - "dpStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), - "gqMean" -> new VCFSignature("Double", "toDouble", ""), - "gqStDev" -> new VCFSignature("Double", "toDouble", ""), - "gqMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), - "gqStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), - "gqMeanHet" -> new VCFSignature("Double", "toDouble", ""), - "gqStDevHet" -> new VCFSignature("Double", "toDouble", ""), - "gqMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), - "gqStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), - "nNonRef" -> new VCFSignature("Int", "toInt", ""), - "rTiTv" -> new VCFSignature("Double", "toDouble", ""), - "rHetHomVar" -> new VCFSignature("Double", "toDouble", ""), - "rDeletionInsertion" -> new VCFSignature("Double", "toDouble", "")) + val signatures = Map("nCalled" -> new SimpleSignature("Int", "toInt"), + "nNotCalled" -> new SimpleSignature("Int", "toInt"), + "nHomRef" -> new SimpleSignature("Int", "toInt"), + "nHet" -> new SimpleSignature("Int", "toInt"), + "nHomVar" -> new SimpleSignature("Int", "toInt"), + "nSNP" -> new SimpleSignature("Int", "toInt"), + "nInsertion" -> new SimpleSignature("Int", "toInt"), + 
"nDeletion" -> new SimpleSignature("Int", "toInt"), + "nSingleton" -> new SimpleSignature("Int", "toInt"), + "nTransition" -> new SimpleSignature("Int", "toInt"), + "nTransversion" -> new SimpleSignature("Int", "toInt"), + "dpMean" -> new SimpleSignature("Double", "toDouble"), + "dpStDev" -> new SimpleSignature("Double", "toDouble"), + "dpMeanHomRef" -> new SimpleSignature("Double", "toDouble"), + "dpStDevHomRef" -> new SimpleSignature("Double", "toDouble"), + "dpMeanHet" -> new SimpleSignature("Double", "toDouble"), + "dpStDevHet" -> new SimpleSignature("Double", "toDouble"), + "dpMeanHomVar" -> new SimpleSignature("Double", "toDouble"), + "dpStDevHomVar" -> new SimpleSignature("Double", "toDouble"), + "gqMean" -> new SimpleSignature("Double", "toDouble"), + "gqStDev" -> new SimpleSignature("Double", "toDouble"), + "gqMeanHomRef" -> new SimpleSignature("Double", "toDouble"), + "gqStDevHomRef" -> new SimpleSignature("Double", "toDouble"), + "gqMeanHet" -> new SimpleSignature("Double", "toDouble"), + "gqStDevHet" -> new SimpleSignature("Double", "toDouble"), + "gqMeanHomVar" -> new SimpleSignature("Double", "toDouble"), + "gqStDevHomVar" -> new SimpleSignature("Double", "toDouble"), + "nNonRef" -> new SimpleSignature("Int", "toInt"), + "rTiTv" -> new SimpleSignature("Double", "toDouble"), + "rHetHomVar" -> new SimpleSignature("Double", "toDouble"), + "rDeletionInsertion" -> new SimpleSignature("Double", "toDouble")) } class SampleQCCombiner extends Serializable { @@ -348,11 +348,11 @@ object SampleQC extends Command { val singletons = sSingletonVariants(vds) val sampleIdsBc = state.sc.broadcast(vds.sampleIds) + val r = results(vds) if (options.store) { - val r = results(vds).collectAsMap() val newAnnotations = vds.metadata.sampleAnnotations .zipWithIndex - .map { case (sa, s) => sa.addMap("qc", r(s).asMap) } + .map { case (sa, s) => sa.addMap("qc", r.collectAsMap()(s).asMap) } state.copy( vds = vds.copy( metadata = vds.metadata.copy( @@ -367,8 +367,7 @@ object 
SampleQC extends Command { } hadoopDelete(output, state.hadoopConf, recursive = true) - val r = results(vds) - .map { case (s, comb) => + r.map { case (s, comb) => val sb = new StringBuilder() sb.append(sampleIdsBc.value(s)) sb += '\t' diff --git a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala index abade95d776..7ad6111dcd5 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala @@ -31,33 +31,33 @@ object VariantQCCombiner { "rHetHomVar\t" + "rExpectedHetFrequency\tpHWE\t" - val signatures = Map("nCalled" -> new VCFSignature("Int", "toInt", ""), - "nNotCalled" -> new VCFSignature("Int", "toInt", ""), - "nHomRef" -> new VCFSignature("Int", "toInt", ""), - "nHet" -> new VCFSignature("Int", "toInt", ""), - "nHomVar" -> new VCFSignature("Int", "toInt", ""), - "dpMean" -> new VCFSignature("Double", "toDouble", ""), - "dpStDev" -> new VCFSignature("Double", "toDouble", ""), - "dpMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), - "dpStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), - "dpMeanHet" -> new VCFSignature("Double", "toDouble", ""), - "dpStDevHet" -> new VCFSignature("Double", "toDouble", ""), - "dpMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), - "dpStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), - "gqMean" -> new VCFSignature("Double", "toDouble", ""), - "gqStDev" -> new VCFSignature("Double", "toDouble", ""), - "gqMeanHomRef" -> new VCFSignature("Double", "toDouble", ""), - "gqStDevHomRef" -> new VCFSignature("Double", "toDouble", ""), - "gqMeanHet" -> new VCFSignature("Double", "toDouble", ""), - "gqStDevHet" -> new VCFSignature("Double", "toDouble", ""), - "gqMeanHomVar" -> new VCFSignature("Double", "toDouble", ""), - "gqStDevHomVar" -> new VCFSignature("Double", "toDouble", ""), - "MAF" -> new VCFSignature("Double", "toDouble", ""), - "nNonRef" -> 
new VCFSignature("Int", "toInt", ""), - "rHeterozygosity" -> new VCFSignature("Double", "toDouble", ""), - "rHetHomVar" -> new VCFSignature("Double", "toDouble", ""), - "rExpectedHetFrequency" -> new VCFSignature("Double", "toDouble", ""), - "pHWE" -> new VCFSignature("Double", "toDouble", "")) + val signatures = Map("nCalled" -> new SimpleSignature("Int", "toInt"), + "nNotCalled" -> new SimpleSignature("Int", "toInt"), + "nHomRef" -> new SimpleSignature("Int", "toInt"), + "nHet" -> new SimpleSignature("Int", "toInt"), + "nHomVar" -> new SimpleSignature("Int", "toInt"), + "dpMean" -> new SimpleSignature("Double", "toDouble"), + "dpStDev" -> new SimpleSignature("Double", "toDouble"), + "dpMeanHomRef" -> new SimpleSignature("Double", "toDouble"), + "dpStDevHomRef" -> new SimpleSignature("Double", "toDouble"), + "dpMeanHet" -> new SimpleSignature("Double", "toDouble"), + "dpStDevHet" -> new SimpleSignature("Double", "toDouble"), + "dpMeanHomVar" -> new SimpleSignature("Double", "toDouble"), + "dpStDevHomVar" -> new SimpleSignature("Double", "toDouble"), + "gqMean" -> new SimpleSignature("Double", "toDouble"), + "gqStDev" -> new SimpleSignature("Double", "toDouble"), + "gqMeanHomRef" -> new SimpleSignature("Double", "toDouble"), + "gqStDevHomRef" -> new SimpleSignature("Double", "toDouble"), + "gqMeanHet" -> new SimpleSignature("Double", "toDouble"), + "gqStDevHet" -> new SimpleSignature("Double", "toDouble"), + "gqMeanHomVar" -> new SimpleSignature("Double", "toDouble"), + "gqStDevHomVar" -> new SimpleSignature("Double", "toDouble"), + "MAF" -> new SimpleSignature("Double", "toDouble"), + "nNonRef" -> new SimpleSignature("Int", "toInt"), + "rHeterozygosity" -> new SimpleSignature("Double", "toDouble"), + "rHetHomVar" -> new SimpleSignature("Double", "toDouble"), + "rExpectedHetFrequency" -> new SimpleSignature("Double", "toDouble"), + "pHWE" -> new SimpleSignature("Double", "toDouble")) } class VariantQCCombiner extends Serializable { @@ -278,7 +278,7 @@ object 
VariantQC extends Command { state.copy(vds = vds.mapAnnotationsWithAggregate(new VariantQCCombiner)((comb, v, s, g) => comb.merge(g), (comb1, comb2) => comb1.merge(comb2), (ad: AnnotationData, comb: VariantQCCombiner) => ad.addMap("qc", comb.asMap)) - .addVariantSignatures(Map("qc" -> VariantQCCombiner.signatures))) + .addVariantMapSignatures("qc", VariantQCCombiner.signatures)) else { writeTextFile(output + ".header", state.hadoopConf) { s => s.write("Chrom\tPos\tRef\tAlt\t") diff --git a/src/main/scala/org/broadinstitute/hail/driver/Write.scala b/src/main/scala/org/broadinstitute/hail/driver/Write.scala index b3db7b150f0..0bd1ab8d2b6 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/Write.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/Write.scala @@ -7,6 +7,9 @@ object Write extends Command { class Options extends BaseOptions { @Args4jOption(required = true, name = "-o", aliases = Array("--output"), usage = "Output file") var output: String = _ + + @Args4jOption(required = false, name = "--compress", usage = "compress genotype streams using LZ4") + var compress: Boolean = true } def newOptions = new Options @@ -15,7 +18,7 @@ object Write extends Command { def run(state: State, options: Options): State = { hadoopDelete(options.output, state.hadoopConf, true) - state.vds.write(state.sqlContext, options.output) + state.vds.write(state.sqlContext, options.output, compress = options.compress) state } } diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index 904d15e7c72..a03b79b6504 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -2,10 +2,11 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.annotations.AnnotationClassBuilder._ import org.broadinstitute.hail.annotations._ -import org.broadinstitute.hail.variant.{Sample, Variant, Genotype} 
+import org.broadinstitute.hail.variant.{VariantMetadata, Sample, Variant, Genotype} import scala.language.implicitConversions object UserExportUtils { + class ExportVariant(val v: Variant) extends AnyVal { def contig = v.contig @@ -44,6 +45,17 @@ object UserExportUtils { } } + // FIXME move to Utils after we figure out what is calling the illegal cyclic operation bug + def toTSVString(a: Any): String = { + a match { + case Some(o) => toTSVString(o) + case None => "NA" + case d: Double => d.formatted("%.4e") + case i: Iterable[_] => i.map(toTSVString).mkString(",") + case arr: Array[_] => arr.map(toTSVString).mkString(",") + case _ => a.toString + } + } } class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures) @@ -53,28 +65,28 @@ class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures) | import org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ | - | | val v: ExportVariant = new ExportVariant(__v) | ${signatures(vas, "__va", makeToString = true)} | ${instantiate("va", "__va")} - | Array($list).map(toTSVString).mkString("\t") + | Array($list).map(toTSVString).mkString("\t") |}: String - """.stripMargin}) { + """.stripMargin + }) { def apply(v: Variant, va: AnnotationData): String = eval()(v, va) } class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures) extends Evaluator[(Sample, AnnotationData) => String]( - {val s = s"""(s: org.broadinstitute.hail.variant.Sample, + s"""(s: org.broadinstitute.hail.variant.Sample, | __sa: org.broadinstitute.hail.annotations.AnnotationData) => { | import org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ - | import org.broadinstitute.hail.Utils.toTSVString + | | ${signatures(sas, "__sa", makeToString = true)} | ${instantiate("sa", "__sa")} | Array($list).map(toTSVString).mkString("\t") |}: String - """.stripMargin;println(s);s}) { + """.stripMargin) { def apply(s: Sample, sa: 
AnnotationData): String = eval()(s, sa) } @@ -84,33 +96,32 @@ object ExportGenotypeEvaluator { type ExportGenotypePostSA = (Variant, AnnotationData) => ((Int, Genotype) => String) } -class ExportGenotypeEvaluator(list: String, vas: AnnotationSignatures, sas: AnnotationSignatures, - sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String]) +class ExportGenotypeEvaluator(list: String, metadata: VariantMetadata) extends EvaluatorWithTransformation[ExportGenotypeEvaluator.ExportGenotypeWithSA, ExportGenotypeEvaluator.ExportGenotypePostSA]( - s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], - | __ids: IndexedSeq[String]) => { - | import org.broadinstitute.hail.methods.FilterUtils._ - | import org.broadinstitute.hail.methods.UserExportUtils._ - | import org.broadinstitute.hail.Utils.toTSVString - | ${signatures(sas, "__sa", makeToString = true)} - | ${makeIndexedSeq("__saArray", "__sa", "__sa")} - | (__v: org.broadinstitute.hail.variant.Variant, - | __va: org.broadinstitute.hail.annotations.AnnotationData) => { - | val v = new ExportVariant(__v) - | ${signatures(vas, "__va")} - | ${instantiate("va", "__va")} - | (__sIndex: Int, - | g: org.broadinstitute.hail.variant.Genotype) => { - | val sa = __saArray(__sIndex) - | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) - | Array($list).map(toTSVString).mkString("\t") - | }: String - | } - | } + s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], + | __ids: IndexedSeq[String]) => { + | import org.broadinstitute.hail.methods.FilterUtils._ + | import org.broadinstitute.hail.methods.UserExportUtils._ + | + | ${signatures(metadata.sampleAnnotationSignatures, "__sa", makeToString = true)} + | ${makeIndexedSeq("__saArray", "__sa", "__sa")} + | (__v: org.broadinstitute.hail.variant.Variant, + | __va: org.broadinstitute.hail.annotations.AnnotationData) => { + | val v = new ExportVariant(__v) + | ${signatures(metadata.variantAnnotationSignatures, "__va")} + | 
${instantiate("va", "__va")} + | (__sIndex: Int, + | g: org.broadinstitute.hail.variant.Genotype) => { + | val sa = __saArray(__sIndex) + | val s = org.broadinstitute.hail.variant.Sample(__ids(__sIndex)) + | Array($list).map(toTSVString).mkString("\t") + | }: String + | } + | } """.stripMargin, - t => t(sad, ids)) { + t => t(metadata.sampleAnnotations, metadata.sampleIds)) { def apply(v: Variant, va: AnnotationData)(sIndex: Int, g: Genotype): String = eval()(v, va)(sIndex, g) -} \ No newline at end of file +} diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 849651a4590..33ea26c934e 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -12,7 +12,7 @@ class FilterString(val s: String) extends AnyVal { def !~(t: String): Boolean = !this.~(t) } -class ConvertibleString(val s: String) extends AnyVal { +class AnnotationValueString(val s: String) extends AnyVal { def toArrayInt: Array[Int] = s.split(",").map(i => i.toInt) def toArrayDouble: Array[Double] = s.split(",").map(i => i.toDouble) @@ -27,7 +27,7 @@ object FilterUtils { implicit def toFilterString(s: String): FilterString = new FilterString(s) - implicit def toConvertibleString(s: String): ConvertibleString = new ConvertibleString(s) + implicit def toAnnotationValueString(s: String): AnnotationValueString = new AnnotationValueString(s) } class FilterVariantCondition(cond: String, vas: AnnotationSignatures) @@ -57,17 +57,16 @@ class FilterSampleCondition(cond: String, sas: AnnotationSignatures) def apply(s: Sample, sa: AnnotationData): Boolean = eval()(s, sa) } -class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: AnnotationSignatures, - sad: IndexedSeq[AnnotationData], ids: IndexedSeq[String]) +class FilterGenotypeCondition(cond: String, metadata: VariantMetadata) extends 
EvaluatorWithTransformation[FilterGenotypeWithSA, FilterGenotypePostSA]( s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], | __ids: IndexedSeq[String]) => { | import org.broadinstitute.hail.methods.FilterUtils._ - | ${signatures(sas, "__sa")} + | ${signatures(metadata.sampleAnnotationSignatures, "__sa")} | ${makeIndexedSeq("__saArray", "__sa", "__sa")} | (v: org.broadinstitute.hail.variant.Variant, | __va: org.broadinstitute.hail.annotations.AnnotationData) => { - | ${signatures(vas, "__va")} + | ${signatures(metadata.variantAnnotationSignatures, "__va")} | ${instantiate("va", "__va")} | (__sIndex: Int, | g: org.broadinstitute.hail.variant.Genotype) => { @@ -78,7 +77,7 @@ class FilterGenotypeCondition(cond: String, vas: AnnotationSignatures, sas: Anno | } | } """.stripMargin, - t => t(sad, ids)) { + t => t(metadata.sampleAnnotations, metadata.sampleIds)) { def apply(v: Variant, va: AnnotationData)(sIndex: Int, g: Genotype): Boolean = eval()(v, va)(sIndex, g) } diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index 514f7689fae..9da245e9f64 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -35,26 +35,11 @@ object LoadVCF { .getHeaderValue .asInstanceOf[htsjdk.variant.vcf.VCFHeader] - // FIXME: use htsjdk to parse contigs when they expose the correct fields - val contigRegex ="""##contig=""".r - val contigs = { - val contigMap = headerLines.map { - case contigRegex(id, length) => - Some((id, length.toInt)) - case _ => None - }.flatMap(i => i) - .toMap - - if (contigMap.nonEmpty) - contigMap - else - null - } - - val filters = header + // FIXME get descriptions when HTSJDK is fixed to expose filter descriptions + val filters: List[(String, String)] = header .getFilterLines .toList -// .map(line => line.g) + .map(line => (line.getID, "")) val infoSignatures = header 
.getInfoHeaderLines @@ -63,11 +48,11 @@ object LoadVCF { .toMap val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> infoSignatures), - Map("filters" -> new VCFSignature("Set[String]","toSetString", "filters applied to site"), - "pass" -> new VCFSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), - "multiallelic" -> new VCFSignature("Boolean", "toBoolean", "Site is a split multiallelic"), - "qual" -> new VCFSignature("Double", "toDouble", "vcf qual field"), - "rsid" -> new VCFSignature("String", "toString", "site rsID"))) + Map("filters" -> new SimpleSignature("Set[String]","toSetString", "filters applied to site"), + "pass" -> new SimpleSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), + "multiallelic" -> new SimpleSignature("Boolean", "toBoolean", "Site is a split multiallelic"), + "qual" -> new SimpleSignature("Double", "toDouble", "vcf qual field"), + "rsid" -> new SimpleSignature("String", "toString", "site rsID"))) val headerLine = headerLines.last assert(headerLine(0) == '#' && headerLine(1) != '#') @@ -76,9 +61,6 @@ object LoadVCF { .split("\t") .drop(9) - val sampleAnnotations = Annotations.emptyOfArrayString(sampleIds.length) - val sampleAnnotationSignatures = Annotations.emptyOfSignature() - val headerLinesBc = sc.broadcast(headerLines) val genotypes = sc.textFile(file, nPartitions.getOrElse(sc.defaultMinPartitions)) .mapPartitions { lines => @@ -93,7 +75,8 @@ object LoadVCF { } } - VariantSampleMatrix(VariantMetadata(contigs, sampleIds, - headerLines, sampleAnnotations, sampleAnnotationSignatures, annotationSignatures), genotypes) + VariantSampleMatrix(VariantMetadata(filters, sampleIds, + headerLines, Annotations.emptyOfArrayString(sampleIds.length), Annotations.emptyOfSignature(), + annotationSignatures), genotypes) } } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala 
b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala index ed78b6c3496..9b068a727d2 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala @@ -3,30 +3,28 @@ package org.broadinstitute.hail.variant import org.broadinstitute.hail.annotations._ object VariantMetadata { - def apply(contigLength: Map[String, Int], - sampleIds: Array[String]): VariantMetadata = new VariantMetadata(contigLength, sampleIds, None, - Annotations.emptyOfArrayString(sampleIds.length), Annotations.emptyOfSignature(), Annotations.emptyOfSignature()) + def apply(sampleIds: Array[String]): VariantMetadata = new VariantMetadata(Seq.empty[(String, String)], + sampleIds, None, Annotations.emptyOfArrayString(sampleIds.length), Annotations.emptyOfSignature(), + Annotations.emptyOfSignature()) - def apply(contigLength: Map[String, Int], - sampleIds: Array[String], - vcfHeader: Array[String]): VariantMetadata = new VariantMetadata(contigLength, sampleIds, Some(vcfHeader), - Annotations.emptyOfArrayString(sampleIds.length), Annotations.emptyOfSignature(), Annotations.emptyOfSignature()) + def apply(sampleIds: Array[String], + vcfHeader: Array[String]): VariantMetadata = new VariantMetadata(Seq.empty[(String, String)], + sampleIds, Some(vcfHeader), Annotations.emptyOfArrayString(sampleIds.length), + Annotations.emptyOfSignature(), Annotations.emptyOfSignature()) - def apply(contigLength: Map[String, Int], sampleIds: Array[String], vcfHeader: Array[String], + def apply(filters: Seq[(String, String)], sampleIds: Array[String], vcfHeader: Array[String], sa: IndexedSeq[AnnotationData], sas: AnnotationSignatures, vas: AnnotationSignatures): VariantMetadata = { - new VariantMetadata(contigLength, sampleIds, Some(vcfHeader), sa, sas, vas) + new VariantMetadata(filters, sampleIds, Some(vcfHeader), sa, sas, vas) } } -case class VariantMetadata(contigLength: Map[String, Int], +case class 
VariantMetadata(filters: Seq[(String, String)], sampleIds: IndexedSeq[String], vcfHeader: Option[IndexedSeq[String]], sampleAnnotations: IndexedSeq[AnnotationData], sampleAnnotationSignatures: AnnotationSignatures, variantAnnotationSignatures: AnnotationSignatures) { - def nContigs: Int = contigLength.size - def nSamples: Int = sampleIds.length def addSampleAnnotations(sas: AnnotationSignatures, sa: IndexedSeq[AnnotationData]): VariantMetadata = { diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala index aba091a53c7..bb997cb850b 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala @@ -102,9 +102,9 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, (implicit utt: TypeTag[U], uct: ClassTag[U]): VariantSampleMatrix[U] = { val localSamplesBc = sparkContext.broadcast(localSamples) copy(rdd = rdd.map { case (v, va, gs) => - val fPrime = f(v, va) + val f2 = f(v, va) (v, va, localSamplesBc.value.view.zip(gs.view) - .map { case (s, t) => fPrime(s, t) }) + .map { case (s, t) => f2(s, t) }) }) } @@ -289,16 +289,24 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, }) } - def addVariantSignatures(maps: Map[String, Map[String, AnnotationSignature]] = Map.empty[String, Map[String, AnnotationSignature]], - vals: Map[String, AnnotationSignature] = Map.empty[String, AnnotationSignature]): VariantSampleMatrix[T] = { - this.copy(metadata = this.metadata.copy(variantAnnotationSignatures = - this.metadata.variantAnnotationSignatures.addMaps(maps).addVals(vals))) + def addVariantMapSignatures(mapName: String, map: Map[String, AnnotationSignature]): VariantSampleMatrix[T] = { + this.copy(metadata = metadata.copy(variantAnnotationSignatures = + metadata.variantAnnotationSignatures.addMap(mapName, map))) } - def addSampleSignatures(maps: Map[String, 
Map[String, AnnotationSignature]] = Map.empty[String, Map[String, AnnotationSignature]], - vals: Map[String, AnnotationSignature] = Map.empty[String, AnnotationSignature]): VariantSampleMatrix[T] = { - this.copy(metadata = this.metadata.copy(sampleAnnotationSignatures = - this.metadata.sampleAnnotationSignatures.addMaps(maps).addVals(vals))) + def addVariantValSignature(name: String, sig: AnnotationSignature): VariantSampleMatrix[T] = { + this.copy(metadata = metadata.copy(variantAnnotationSignatures = + metadata.variantAnnotationSignatures.addVal(name, sig))) + } + + def addSampleMapSignatures(mapName: String, map: Map[String, AnnotationSignature]): VariantSampleMatrix[T] = { + this.copy(metadata = metadata.copy(sampleAnnotationSignatures = + metadata.sampleAnnotationSignatures.addMap(mapName, map))) + } + + def addSampleValSignature(name: String, sig: AnnotationSignature): VariantSampleMatrix[T] = { + this.copy(metadata = metadata.copy(sampleAnnotationSignatures = + metadata.sampleAnnotationSignatures.addVal(name, sig))) } } diff --git a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala index 5f044407fd7..16dca23de63 100644 --- a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala @@ -6,7 +6,7 @@ import org.broadinstitute.hail.driver._ import org.broadinstitute.hail.variant.{Genotype, IntervalList, Variant} import org.testng.annotations.Test import org.broadinstitute.hail.methods._ -import org.broadinstitute.hail.methods.FilterUtils.toConvertibleString +import org.broadinstitute.hail.methods.FilterUtils.toAnnotationValueString import scala.language.implicitConversions /** @@ -97,7 +97,7 @@ class AnnotationsSuite extends SparkSuite { .containsInMap("info", "DB")) //type Set[String] - assert(vas.getVal("filters").contains(new VCFSignature("Set[String]", 
"toSetString", "filters applied to site"))) + assert(vas.getVal("filters").contains(new SimpleSignature("Set[String]", "toSetString", "filters applied to site"))) assert(variantAnnotationMap(firstVariant) .getVal("filters").contains("PASS") && variantAnnotationMap(firstVariant) @@ -108,7 +108,7 @@ class AnnotationsSuite extends SparkSuite { .getVal("filters").get.toSetString == Set[String]("VQSRTrancheSNP99.95to100.00")) // GATK PASS - assert(vas.getVal("pass").contains(new VCFSignature("Boolean", "toBoolean", + assert(vas.getVal("pass").contains(new SimpleSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"))) assert(variantAnnotationMap(firstVariant) .getVal("pass").contains("true")) diff --git a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala index 3db80311581..23985e7d652 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/ExportSuite.scala @@ -3,7 +3,7 @@ package org.broadinstitute.hail.methods import org.broadinstitute.hail.SparkSuite import org.broadinstitute.hail.annotations._ import org.broadinstitute.hail.driver._ -//import org.broadinstitute.hail.methods.UserExportUtils.toTSVString +import org.broadinstitute.hail.methods.UserExportUtils.toTSVString import org.broadinstitute.hail.variant.Sample import org.testng.annotations.Test import scala.io.Source @@ -14,7 +14,7 @@ import scala.io.Source * their output agrees with [[org.broadinstitute.hail.driver.VariantQC]] and * [[org.broadinstitute.hail.driver.SampleQC]] commands. 
*/ -class ExportSuite extends SparkSuite{ +class ExportSuite extends SparkSuite { @Test def test() { val vds = LoadVCF(sc, "src/test/resources/sample.vcf") @@ -23,10 +23,10 @@ class ExportSuite extends SparkSuite{ SampleQC.run(state, Array("-o", "/tmp/sampleQC")) val postSampleQC = SampleQC.run(state, Array("--store")) -// println(toTSVString(Some(5.1))) -// println(toTSVString(Some(None))) -// println(toTSVString(Array(1,2,3,4,5))) -// println(toTSVString(5.124)) + assert(toTSVString(Some(5.1)) == "5.1000e+00") + assert(toTSVString(Some(None)) == "NA") + assert(toTSVString(Array(1,2,3,4,5)) == "1,2,3,4,5") + assert(toTSVString(5.124) == "5.1240e+00") ExportSamples.run(postSampleQC, Array("-o", "/tmp/exportSamples", "-c", "s.id, sa.qc.nCalled,sa.qc.nNotCalled,sa.qc.nHomRef,sa.qc.nHet,sa.qc.nHomVar,sa.qc.nSNP,sa.qc.nInsertion," + diff --git a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala index b202236a188..9b0c6749998 100644 --- a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala +++ b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala @@ -121,6 +121,6 @@ object TestRDDBuilder { } (variant, Annotations.emptyOfString(), b.result(): Iterable[Genotype]) } - VariantSampleMatrix(VariantMetadata(Map("1" -> 1000000), sampleList), streamRDD) + VariantSampleMatrix(VariantMetadata(sampleList), streamRDD) } } \ No newline at end of file diff --git a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala index 2490f3a96b1..64dc35017f4 100644 --- a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala @@ -17,13 +17,10 @@ class VSMSuite extends SparkSuite { val vds2 = LoadVCF(sc, "src/test/resources/sample.vcf.gz") assert(vds1.same(vds2)) - val mdata1 = VariantMetadata(Map("1" -> 10, "2" -> 10), 
Array("S1", "S2", "S3")) - val mdata2 = VariantMetadata(Map("1" -> 10, "2" -> 20), Array("S1", "S2", "S3")) - val mdata3 = VariantMetadata(Map("1" -> 10), Array("S1", "S2")) + val mdata1 = VariantMetadata(Array("S1", "S2", "S3")) + val mdata2 = VariantMetadata(Array("S1", "S2")) assert(mdata1 != mdata2) - assert(mdata1 != mdata3) - assert(mdata2 != mdata3) val v1 = Variant("1", 1, "A", "T") val v2 = Variant("1", 2, "T", "G") @@ -62,7 +59,7 @@ class VSMSuite extends SparkSuite { Genotype(0, (11, 0), 11, (0, 10, 100)), Genotype(1, (6, 6), 12, (50, 0, 50)))))) - // for mdata3 + // for mdata2 val rdd4 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), Genotype(0, (11, 1), 12, (0, 10, 100)))), @@ -91,8 +88,8 @@ class VSMSuite extends SparkSuite { new VariantDataset(mdata2, rdd1), new VariantDataset(mdata2, rdd2), new VariantDataset(mdata2, rdd3), - new VariantDataset(mdata3, rdd4), - new VariantDataset(mdata3, rdd5), + new VariantDataset(mdata2, rdd4), + new VariantDataset(mdata2, rdd5), new VariantDataset(mdata1, rdd6)) for (i <- vdss.indices; From 205a4a7d4c4bf89fb00ae330074aa26beed444b6 Mon Sep 17 00:00:00 2001 From: tpoterba Date: Mon, 21 Dec 2015 15:53:08 -0500 Subject: [PATCH 13/15] Fourth round of comments from cseed integrated --- .../annotations/AnnotationSignature.scala | 4 +- .../hail/annotations/Annotations.scala | 44 +++++-------------- .../hail/annotations/VCFSignature.scala | 14 +----- .../hail/driver/FilterGenotypes.scala | 4 -- .../hail/driver/FilterSamples.scala | 3 +- .../broadinstitute/hail/driver/SampleQC.scala | 4 +- .../hail/methods/ExportTSV.scala | 16 +++---- .../broadinstitute/hail/methods/Filter.scala | 16 +++---- .../broadinstitute/hail/methods/LoadVCF.scala | 14 +++--- .../hail/variant/VariantMetadata.scala | 6 +-- .../hail/variant/VariantSampleMatrix.scala | 17 +++---- .../hail/annotations/AnnotationsSuite.scala | 5 +-- .../hail/methods/FilterSuite.scala | 11 ++--- .../hail/variant/vsm/VSMSuite.scala | 12 ++++- 14 files 
changed, 72 insertions(+), 98 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala index bc4b52132fd..f1ba82a1363 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala @@ -7,9 +7,7 @@ abstract class AnnotationSignature { } -case class SimpleSignature(emitType: String, emitConversionIdentifier: String, desc: String) extends AnnotationSignature { - - def this(emitType: String, emitConversionIdentifier: String) = this(emitType, emitConversionIdentifier, "") +case class SimpleSignature(emitType: String, emitConversionIdentifier: String) extends AnnotationSignature { def emitUtilities = "" diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index 0655cd9f152..cdc46c6317b 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -2,8 +2,6 @@ package org.broadinstitute.hail.annotations case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T]) extends Serializable { - def nAttrs: Int = maps.map(_._2.size).sum + vals.size - def hasMap(str: String): Boolean = maps.contains(str) def containsVal(str: String): Boolean = vals.contains(str) @@ -42,8 +40,7 @@ object Annotations { def emptyOfString(): AnnotationData = empty[String]() def emptyOfArrayString(nSamples: Int): IndexedSeq[AnnotationData] = - (0 until nSamples) - .map(i => empty[String]()) + IndexedSeq.fill[Annotations[String]](nSamples)(empty[String]()) } object AnnotationUtils { @@ -58,7 +55,7 @@ object AnnotationUtils { object AnnotationClassBuilder { - def signatures(sigs: AnnotationSignatures, hiddenClassName: String, + def signatures(sigs: 
AnnotationSignatures, className: String, makeToString: Boolean = false): String = { val internalClasses = sigs.maps.map { case (subclass, subMap) => @@ -76,7 +73,7 @@ object AnnotationClassBuilder { | def all: String = __fields.mkString("\t")""".stripMargin } else "" } - s"""class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable { + s"""class __$subclass(subMap: Map[String, String]) extends Serializable { |$attrs |$methods |}""".stripMargin @@ -86,50 +83,33 @@ object AnnotationClassBuilder { val hiddenClass = { val classes = sigs.maps.map { case (subclass, subMap) => - s""" val $subclass = new __${subclass}Annotations(annot.maps("$subclass"))""" + s""" val $subclass = new __$subclass(annot.maps("$subclass"))""" } .mkString("\n") val vals = sigs.vals.map { case (k, sig) => s""" val $k: Option[${sig.emitType}] = annot.getVal("$k").map(_.${sig.emitConversionIdentifier})""" } .mkString("\n") - s"""class ${hiddenClassName}Annotations(annot: org.broadinstitute.hail.annotations.AnnotationData) + s"""class $className(annot: org.broadinstitute.hail.annotations.AnnotationData) | extends Serializable { - | ${if (classes.nonEmpty) classes else "// no classes"} + | ${if (internalClasses.nonEmpty) internalClasses else "// no internal class declarations"} + | ${if (classes.nonEmpty) classes else "// no class instantiations"} | ${if (vals.nonEmpty) vals else "// no vals"} |} |""".stripMargin } s""" - |$internalClasses |$hiddenClass """.stripMargin } - def instantiate(exposedName: String, hiddenClassName: String): String = { - s"val $exposedName = new ${hiddenClassName}Annotations($hiddenClassName)\n" + def instantiate(exposedName: String, className: String, rawName: String): String = { + s"val $exposedName = new $className($rawName)\n" } - def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String = - s"""val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] = - 
|$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_)) - | + def instantiateIndexedSeq(exposedName: String, classIdentifier: String, rawArrayName: String): String = + s"""val $exposedName: IndexedSeq[$classIdentifier] = + | $rawArrayName.map(new $classIdentifier(_)) """.stripMargin - - val arrayRegex = """Array\[(\w+)\]""".r - val optionRegex = """Option\[(\w+)\]""".r - - private def getDefault(typeStr: String): String = { - if (typeStr == "Int" || typeStr == "Double") - "0" - else if (typeStr == "Boolean") - "false" - else - typeStr match { - case optionRegex(subType) => "None" - case arrayRegex(subType) => getDefault(subType) - case _ => "" - } - } } diff --git a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala index e097d112a7a..74f117b3900 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala @@ -13,6 +13,7 @@ object VCFSignature { val arrayRegex = """Array\[(\w+)\]""".r val setRegex = """Set\[(\w+)\]""".r + val integerRegex = """(\d+)""".r def parseConversionIdentifier(str: String): String = { str match { @@ -22,19 +23,6 @@ object VCFSignature { } } - def vcfTypeToScala(str: String): String = - str match { - case "Flag" => "Boolean" - case "Integer" => "Int" - case "Float" => "Double" - case "String" => "String" - case "Character" => "Character" - case "." 
=> "String" - case _ => throw new UnsupportedOperationException("unexpected annotation type") - } - - val integerRegex = """(\d+)""".r - def parse(line: VCFInfoHeaderLine): AnnotationSignature = { val vcfType = line.getType.toString val parsedType = line.getType match { diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala index 158bedf23cc..ee62b169616 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterGenotypes.scala @@ -28,10 +28,6 @@ object FilterGenotypes extends Command { def run(state: State, options: Options): State = { val vds = state.vds -// val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures -// val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures -// val ids = vds.sampleIds -// val sa = vds.metadata.sampleAnnotations if (!options.keep && !options.remove) fatal(name + ": one of `--keep' or `--remove' required") diff --git a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala index 3e64a432a47..f210ee72877 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/FilterSamples.scala @@ -48,8 +48,7 @@ object FilterSamples extends Command { val cf = new FilterSampleCondition(c, vds.metadata.sampleAnnotationSignatures) cf.typeCheck() - val sampleIdsBc = state.sc.broadcast(vds.sampleIds) - (s: Int, sa: AnnotationData) => cf(Sample(sampleIdsBc.value(s)), vds.metadata.sampleAnnotations(s)) + (s: Int, sa: AnnotationData) => cf(Sample(vds.sampleIds(s)), vds.metadata.sampleAnnotations(s)) } val newVDS = vds.filterSamples(if (options.keep) diff --git a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala index 
e14b50ddc51..9ace03e2e4d 100644 --- a/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala +++ b/src/main/scala/org/broadinstitute/hail/driver/SampleQC.scala @@ -358,7 +358,9 @@ object SampleQC extends Command { metadata = vds.metadata.copy( sampleAnnotationSignatures = vds.metadata.sampleAnnotationSignatures .addMap("qc", SampleQCCombiner.signatures), - sampleAnnotations = newAnnotations))) + sampleAnnotations = vds.metadata.sampleAnnotations + .zip(newAnnotations) + .map { case (oldAnno, newAnno) => oldAnno ++ newAnno}))) } else { writeTextFile(output + ".header", state.hadoopConf) { s => s.write("sampleID\t") diff --git a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala index a03b79b6504..3912b7ad332 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/ExportTSV.scala @@ -66,8 +66,8 @@ class ExportVariantsEvaluator(list: String, vas: AnnotationSignatures) | import org.broadinstitute.hail.methods.UserExportUtils._ | | val v: ExportVariant = new ExportVariant(__v) - | ${signatures(vas, "__va", makeToString = true)} - | ${instantiate("va", "__va")} + | ${signatures(vas, "__vaClass", makeToString = true)} + | ${instantiate("va", "__vaClass", "__va")} | Array($list).map(toTSVString).mkString("\t") |}: String """.stripMargin @@ -82,8 +82,8 @@ class ExportSamplesEvaluator(list: String, sas: AnnotationSignatures) | import org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ | - | ${signatures(sas, "__sa", makeToString = true)} - | ${instantiate("sa", "__sa")} + | ${signatures(sas, "__saClass", makeToString = true)} + | ${instantiate("sa", "__saClass", "__sa")} | Array($list).map(toTSVString).mkString("\t") |}: String """.stripMargin) { @@ -104,13 +104,13 @@ class ExportGenotypeEvaluator(list: String, metadata: VariantMetadata) | import 
org.broadinstitute.hail.methods.FilterUtils._ | import org.broadinstitute.hail.methods.UserExportUtils._ | - | ${signatures(metadata.sampleAnnotationSignatures, "__sa", makeToString = true)} - | ${makeIndexedSeq("__saArray", "__sa", "__sa")} + | ${signatures(metadata.sampleAnnotationSignatures, "__saClass", makeToString = true)} + | ${instantiateIndexedSeq("__saArray", "__saClass", "__sa")} | (__v: org.broadinstitute.hail.variant.Variant, | __va: org.broadinstitute.hail.annotations.AnnotationData) => { | val v = new ExportVariant(__v) - | ${signatures(metadata.variantAnnotationSignatures, "__va")} - | ${instantiate("va", "__va")} + | ${signatures(metadata.variantAnnotationSignatures, "__vaClass")} + | ${instantiate("va", "__vaClass", "__va")} | (__sIndex: Int, | g: org.broadinstitute.hail.variant.Genotype) => { | val sa = __saArray(__sIndex) diff --git a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala index 33ea26c934e..a117668dc52 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/Filter.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/Filter.scala @@ -35,8 +35,8 @@ class FilterVariantCondition(cond: String, vas: AnnotationSignatures) s"""(v: org.broadinstitute.hail.variant.Variant, | __va: org.broadinstitute.hail.annotations.AnnotationData) => { | import org.broadinstitute.hail.methods.FilterUtils._ - | ${signatures(vas, "__va")} - | ${instantiate("va", "__va")} + | ${signatures(vas, "__vaClass")} + | ${instantiate("va", "__vaClass", "__va")} | $cond |}: Boolean """.stripMargin @@ -49,8 +49,8 @@ class FilterSampleCondition(cond: String, sas: AnnotationSignatures) s"""(s: org.broadinstitute.hail.variant.Sample, | __sa: org.broadinstitute.hail.annotations.AnnotationData) => { | import org.broadinstitute.hail.methods.FilterUtils._ - | ${signatures(sas, "__sa")} - | ${instantiate("sa", "__sa")} + | ${signatures(sas, "__saClass")} + | ${instantiate("sa", "__saClass", "__sa")} 
| $cond |}: Boolean """.stripMargin) { @@ -62,12 +62,12 @@ class FilterGenotypeCondition(cond: String, metadata: VariantMetadata) s"""(__sa: IndexedSeq[org.broadinstitute.hail.annotations.AnnotationData], | __ids: IndexedSeq[String]) => { | import org.broadinstitute.hail.methods.FilterUtils._ - | ${signatures(metadata.sampleAnnotationSignatures, "__sa")} - | ${makeIndexedSeq("__saArray", "__sa", "__sa")} + | ${signatures(metadata.sampleAnnotationSignatures, "__saClass")} + | ${instantiateIndexedSeq("__saArray", "__saClass", "__sa")} | (v: org.broadinstitute.hail.variant.Variant, | __va: org.broadinstitute.hail.annotations.AnnotationData) => { - | ${signatures(metadata.variantAnnotationSignatures, "__va")} - | ${instantiate("va", "__va")} + | ${signatures(metadata.variantAnnotationSignatures, "__vaClass")} + | ${instantiate("va", "__vaClass", "__va")} | (__sIndex: Int, | g: org.broadinstitute.hail.variant.Genotype) => { | val sa = __saArray(__sIndex) diff --git a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala index 9da245e9f64..f09788ebe84 100644 --- a/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala +++ b/src/main/scala/org/broadinstitute/hail/methods/LoadVCF.scala @@ -47,12 +47,12 @@ object LoadVCF { .map(line => (line.getID, VCFSignature.parse(line))) .toMap - val annotationSignatures: AnnotationSignatures = Annotations[AnnotationSignature](Map("info" -> infoSignatures), - Map("filters" -> new SimpleSignature("Set[String]","toSetString", "filters applied to site"), - "pass" -> new SimpleSignature("Boolean", "toBoolean", "filters were applied to vcf and this site passed"), - "multiallelic" -> new SimpleSignature("Boolean", "toBoolean", "Site is a split multiallelic"), - "qual" -> new SimpleSignature("Double", "toDouble", "vcf qual field"), - "rsid" -> new SimpleSignature("String", "toString", "site rsID"))) + val variantAnnotationSignatures: AnnotationSignatures = 
Annotations[AnnotationSignature](Map("info" -> infoSignatures), + Map("filters" -> new SimpleSignature("Set[String]","toSetString"), + "pass" -> new SimpleSignature("Boolean", "toBoolean"), + "multiallelic" -> new SimpleSignature("Boolean", "toBoolean"), + "qual" -> new SimpleSignature("Double", "toDouble"), + "rsid" -> new SimpleSignature("String", "toString"))) val headerLine = headerLines.last assert(headerLine(0) == '#' && headerLine(1) != '#') @@ -77,6 +77,6 @@ object LoadVCF { VariantSampleMatrix(VariantMetadata(filters, sampleIds, headerLines, Annotations.emptyOfArrayString(sampleIds.length), Annotations.emptyOfSignature(), - annotationSignatures), genotypes) + variantAnnotationSignatures), genotypes) } } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala index 9b068a727d2..6a208febdf7 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantMetadata.scala @@ -28,9 +28,9 @@ case class VariantMetadata(filters: Seq[(String, String)], def nSamples: Int = sampleIds.length def addSampleAnnotations(sas: AnnotationSignatures, sa: IndexedSeq[AnnotationData]): VariantMetadata = { - this.copy( - sampleAnnotationSignatures = this.sampleAnnotationSignatures ++ sas, - sampleAnnotations = this.sampleAnnotations.zip(sa).map { case (a1, a2) => a1 ++ a2 } + copy( + sampleAnnotationSignatures = sampleAnnotationSignatures ++ sas, + sampleAnnotations = sampleAnnotations.zip(sa).map { case (a1, a2) => a1 ++ a2 } ) } } diff --git a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala index bb997cb850b..2c95bd072bb 100644 --- a/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala +++ b/src/main/scala/org/broadinstitute/hail/variant/VariantSampleMatrix.scala @@ -50,9 +50,9 @@ class 
VariantSampleMatrix[T](val metadata: VariantMetadata, def nLocalSamples: Int = localSamples.length - def copy[U](metadata: VariantMetadata = this.metadata, - localSamples: Array[Int] = this.localSamples, - rdd: RDD[(Variant, AnnotationData, Iterable[U])] = this.rdd) + def copy[U](metadata: VariantMetadata = metadata, + localSamples: Array[Int] = localSamples, + rdd: RDD[(Variant, AnnotationData, Iterable[U])] = rdd) (implicit ttt: TypeTag[U], tct: ClassTag[U]): VariantSampleMatrix[U] = new VariantSampleMatrix(metadata, localSamples, rdd) @@ -144,6 +144,7 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, def filterVariants(ilist: IntervalList): VariantSampleMatrix[T] = filterVariants((v, va) => ilist.contains(v.contig, v.start)) + // see if we can remove broadcasts elsewhere in the code def filterSamples(p: (Int, AnnotationData) => Boolean): VariantSampleMatrix[T] = { val mask = localSamples.zip(metadata.sampleAnnotations).map { case (s, sa) => p(s, sa) } val maskBc = sparkContext.broadcast(mask) @@ -278,7 +279,7 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, val zeroArray = new Array[Byte](zeroBuffer.limit) zeroBuffer.get(zeroArray) - this.copy(rdd = rdd + copy(rdd = rdd .map { case (v, va, gs) => val serializer = SparkEnv.get.serializer.newInstance() val zeroValue = serializer.deserialize[U](ByteBuffer.wrap(zeroArray)) @@ -290,22 +291,22 @@ class VariantSampleMatrix[T](val metadata: VariantMetadata, } def addVariantMapSignatures(mapName: String, map: Map[String, AnnotationSignature]): VariantSampleMatrix[T] = { - this.copy(metadata = metadata.copy(variantAnnotationSignatures = + copy(metadata = metadata.copy(variantAnnotationSignatures = metadata.variantAnnotationSignatures.addMap(mapName, map))) } def addVariantValSignature(name: String, sig: AnnotationSignature): VariantSampleMatrix[T] = { - this.copy(metadata = metadata.copy(variantAnnotationSignatures = + copy(metadata = metadata.copy(variantAnnotationSignatures = 
metadata.variantAnnotationSignatures.addVal(name, sig))) } def addSampleMapSignatures(mapName: String, map: Map[String, AnnotationSignature]): VariantSampleMatrix[T] = { - this.copy(metadata = metadata.copy(sampleAnnotationSignatures = + copy(metadata = metadata.copy(sampleAnnotationSignatures = metadata.sampleAnnotationSignatures.addMap(mapName, map))) } def addSampleValSignature(name: String, sig: AnnotationSignature): VariantSampleMatrix[T] = { - this.copy(metadata = metadata.copy(sampleAnnotationSignatures = + copy(metadata = metadata.copy(sampleAnnotationSignatures = metadata.sampleAnnotationSignatures.addVal(name, sig))) } } diff --git a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala index 16dca23de63..f1875435d51 100644 --- a/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/annotations/AnnotationsSuite.scala @@ -97,7 +97,7 @@ class AnnotationsSuite extends SparkSuite { .containsInMap("info", "DB")) //type Set[String] - assert(vas.getVal("filters").contains(new SimpleSignature("Set[String]", "toSetString", "filters applied to site"))) + assert(vas.getVal("filters").contains(new SimpleSignature("Set[String]", "toSetString"))) assert(variantAnnotationMap(firstVariant) .getVal("filters").contains("PASS") && variantAnnotationMap(firstVariant) @@ -108,8 +108,7 @@ class AnnotationsSuite extends SparkSuite { .getVal("filters").get.toSetString == Set[String]("VQSRTrancheSNP99.95to100.00")) // GATK PASS - assert(vas.getVal("pass").contains(new SimpleSignature("Boolean", "toBoolean", - "filters were applied to vcf and this site passed"))) + assert(vas.getVal("pass").contains(new SimpleSignature("Boolean", "toBoolean"))) assert(variantAnnotationMap(firstVariant) .getVal("pass").contains("true")) assert(variantAnnotationMap(anotherVariant) diff --git 
a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala index 87b2a248bc7..c902b363ba3 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala @@ -23,11 +23,12 @@ class FilterSuite extends SparkSuite { // the below command will test typing of runtime-generated code exposing annotations FilterGenotypes.run(state, Array("--keep", "-c", - """assert(va.pass.forall(_.getClass.getName == "boolean"), "va.pass was not a boolean") - |assert(va.info.AN.forall(_.getClass.getName == "int"), "AN was not an int") - |assert(va.info.GQ_MEAN.forall(_.getClass.getName == "double"), "GQ_MEAN was not a double") - |assert(va.info.AC.forall(_.getClass.getSimpleName == "int[]"), "AC was not an int array") - |assert(va.filters.forall(_.getClass.getName.contains("scala.collection.immutable.Set")), + """assert(va.pass.forall(_.isInstanceOf[Boolean]), "va.pass was not a boolean") + |assert(va.info.AN.forall(_.isInstanceOf[Int]), "AN was not an int") + |assert(va.info.GQ_MEAN.forall(_.isInstanceOf[Double]), "GQ_MEAN was not a double") + |assert(va.info.AC.forall(_.isInstanceOf[Array[Int]]) && va.info.AC.forall(_.forall(_.isInstanceOf[Int])), + | "AC was not an int array") + |assert(va.filters.forall(_.isInstanceOf[Set[String]]) && va.filters.forall(_.forall(_.isInstanceOf[String])), | "filters was not a set") |true""".stripMargin)).vds.expand().collect() } diff --git a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala index 64dc35017f4..dbd5ed91e4d 100644 --- a/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/variant/vsm/VSMSuite.scala @@ -19,6 +19,12 @@ class VSMSuite extends SparkSuite { val mdata1 = VariantMetadata(Array("S1", "S2", "S3")) val mdata2 = 
VariantMetadata(Array("S1", "S2")) + val mdata3 = new VariantMetadata(Seq.empty[(String, String)], Array("S1", "S2"), None, + Annotations.emptyOfArrayString(2).map(_.addVal("1", "5")), Annotations.emptyOfSignature(), + Annotations.emptyOfSignature()) + val mdata4 = new VariantMetadata(Seq.empty[(String, String)], Array("S1", "S2"), None, + Annotations.emptyOfArrayString(2), Annotations.emptyOfSignature(), Annotations.emptyOfSignature() + .addMap("dummy", Map.empty[String, AnnotationSignature])) assert(mdata1 != mdata2) @@ -28,7 +34,7 @@ class VSMSuite extends SparkSuite { val va1 = Annotations(Map("info" -> Map("v1thing" -> "yes")), Map("v1otherThing" -> "yes")) val va2 = Annotations(Map("info" -> Map("v2thing" -> "yes")), Map("v2otherThing" -> "yes")) - val va3 = Annotations(Map("info" -> Map("v3thing" -> "yes")), Map("v3otherThing" -> "yes")) + val va3 = Annotations(Map("info" -> Map("v1thing" -> "no")), Map("v1otherThing" -> "no")) val rdd1 = sc.parallelize(Seq((v1, va1, Iterable(Genotype(-1, (0, 2), 2, null), @@ -90,6 +96,10 @@ class VSMSuite extends SparkSuite { new VariantDataset(mdata2, rdd3), new VariantDataset(mdata2, rdd4), new VariantDataset(mdata2, rdd5), + new VariantDataset(mdata3, rdd1), + new VariantDataset(mdata3, rdd2), + new VariantDataset(mdata4, rdd1), + new VariantDataset(mdata4, rdd2), new VariantDataset(mdata1, rdd6)) for (i <- vdss.indices; From 5d2ec4eaa1c872adfc6e95dbf887d1a51a89955a Mon Sep 17 00:00:00 2001 From: tpoterba Date: Mon, 21 Dec 2015 16:14:11 -0500 Subject: [PATCH 14/15] Fourth round of comments from cseed integrated --- .../hail/annotations/Annotations.scala | 12 +----------- .../hail/vcf/HtsjdkRecordReader.scala | 14 ++++++++++---- .../broadinstitute/hail/methods/FilterSuite.scala | 2 +- .../broadinstitute/hail/utils/TestRDDBuilder.scala | 2 +- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala 
b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala index cdc46c6317b..7e2e3455e5a 100644 --- a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala +++ b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala @@ -37,22 +37,12 @@ object Annotations { def emptyOfSignature(): AnnotationSignatures = empty[AnnotationSignature]() - def emptyOfString(): AnnotationData = empty[String]() + def emptyOfData(): AnnotationData = empty[String]() def emptyOfArrayString(nSamples: Int): IndexedSeq[AnnotationData] = IndexedSeq.fill[Annotations[String]](nSamples)(empty[String]()) } -object AnnotationUtils { - - def annotationToString(ar: AnyRef): String = { - ar match { - case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).mkString(", ") - case _ => ar.toString - } - } -} - object AnnotationClassBuilder { def signatures(sigs: AnnotationSignatures, className: String, diff --git a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala index 5b223cadd6a..d9dcb2a73fd 100644 --- a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala +++ b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala @@ -3,8 +3,6 @@ package org.broadinstitute.hail.vcf import htsjdk.variant.variantcontext.Allele import org.broadinstitute.hail.variant._ import org.broadinstitute.hail.annotations._ -import org.broadinstitute.hail.annotations.AnnotationUtils.annotationToString - import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -21,6 +19,14 @@ class BufferedLineIterator(bit: BufferedIterator[String]) extends htsjdk.tribble } class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializable { + + def infoToString(ar: AnyRef): String = { + ar match { + case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).mkString(", ") + case _ => ar.toString + } + } + def 
readRecord(line: String): Iterator[(Variant, AnnotationData, Iterator[Genotype])] = { val vc = codec.decode(line) //maybe count tabs to get filter field @@ -41,7 +47,7 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl vc.getAlternateAllele(0).getBaseString) Iterator.single((variant, Annotations[String](Map[String, Map[String, String]]("info" -> vc.getAttributes .asScala - .mapValues(annotationToString) + .mapValues(infoToString) .toMap), Map[String, String]( "qual" -> vc.getPhredScaledQual.toString, @@ -93,7 +99,7 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl (Variant(vc.getContig, vc.getStart, ref.getBaseString, alt.getBaseString, wasSplit = true), Annotations[String](Map[String, Map[String, String]]("info" -> vc.getAttributes .asScala - .mapValues(annotationToString) + .mapValues(infoToString) .toMap), Map[String, String]( "qual" -> vc.getPhredScaledQual.toString, diff --git a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala index c902b363ba3..b00bec3278c 100644 --- a/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala +++ b/src/test/scala/org/broadinstitute/hail/methods/FilterSuite.scala @@ -28,7 +28,7 @@ class FilterSuite extends SparkSuite { |assert(va.info.GQ_MEAN.forall(_.isInstanceOf[Double]), "GQ_MEAN was not a double") |assert(va.info.AC.forall(_.isInstanceOf[Array[Int]]) && va.info.AC.forall(_.forall(_.isInstanceOf[Int])), | "AC was not an int array") - |assert(va.filters.forall(_.isInstanceOf[Set[String]]) && va.filters.forall(_.forall(_.isInstanceOf[String])), + |assert(va.filters.forall(_.isInstanceOf[Set[_]]) && va.filters.forall(_.forall(_.isInstanceOf[String])), | "filters was not a set") |true""".stripMargin)).vds.expand().collect() } diff --git a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala 
index 9b0c6749998..ffd8bd983b3 100644 --- a/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala +++ b/src/test/scala/org/broadinstitute/hail/utils/TestRDDBuilder.scala @@ -119,7 +119,7 @@ object TestRDDBuilder { b += Genotype(gt, ad, dp, pl) } - (variant, Annotations.emptyOfString(), b.result(): Iterable[Genotype]) + (variant, Annotations.emptyOfData(), b.result(): Iterable[Genotype]) } VariantSampleMatrix(VariantMetadata(sampleList), streamRDD) } From 1c601fb25fac070405f6f0edd0db3d176805c69b Mon Sep 17 00:00:00 2001 From: tpoterba Date: Mon, 21 Dec 2015 16:15:28 -0500 Subject: [PATCH 15/15] Fourth round of comments from cseed integrated --- .../hail/vcf/HtsjdkRecordReader.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala index d9dcb2a73fd..e78e1575b76 100644 --- a/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala +++ b/src/main/scala/org/broadinstitute/hail/vcf/HtsjdkRecordReader.scala @@ -20,13 +20,6 @@ class BufferedLineIterator(bit: BufferedIterator[String]) extends htsjdk.tribble class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializable { - def infoToString(ar: AnyRef): String = { - ar match { - case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).mkString(", ") - case _ => ar.toString - } - } - def readRecord(line: String): Iterator[(Variant, AnnotationData, Iterator[Genotype])] = { val vc = codec.decode(line) //maybe count tabs to get filter field @@ -47,7 +40,7 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl vc.getAlternateAllele(0).getBaseString) Iterator.single((variant, Annotations[String](Map[String, Map[String, String]]("info" -> vc.getAttributes .asScala - .mapValues(infoToString) + .mapValues(HtsjdkRecordReader.infoToString) .toMap), Map[String, String]( "qual" -> 
vc.getPhredScaledQual.toString, @@ -99,7 +92,7 @@ class HtsjdkRecordReader(codec: htsjdk.variant.vcf.VCFCodec) extends Serializabl (Variant(vc.getContig, vc.getStart, ref.getBaseString, alt.getBaseString, wasSplit = true), Annotations[String](Map[String, Map[String, String]]("info" -> vc.getAttributes .asScala - .mapValues(infoToString) + .mapValues(HtsjdkRecordReader.infoToString) .toMap), Map[String, String]( "qual" -> vc.getPhredScaledQual.toString, @@ -167,4 +160,11 @@ object HtsjdkRecordReader { codec.readHeader(new BufferedLineIterator(headerLines.iterator.buffered)) new HtsjdkRecordReader(codec) } + + def infoToString(ar: AnyRef): String = { + ar match { + case iter: Iterable[_] => if (iter.isEmpty) "" else iter.map(_.toString).mkString(", ") + case _ => ar.toString + } + } }