-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Tp info #97
Tp info #97
Changes from 5 commits
b72af87
ccee7d8
99db742
da29bfa
59d8c08
ec09ddf
d58ba41
a0c46fe
1b06044
f3ff63d
7d7026a
d4a3acd
866b435
6446687
1405f80
205a4a7
5d2ec4e
1c601fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package org.broadinstitute.hail.annotations | ||
|
||
/**
 * Describes one annotation field for the code generator: the pieces of Scala
 * source text needed to declare the field and convert its raw value.
 */
abstract class AnnotationSignature {

  /** Scala type of the field, written as source text (emitted after `val name:`). */
  def getType: String

  /** Name of the method appended to the raw value to convert it to `getType`. */
  def conversion: String

  /** Source text contributed by this signature; presumably supporting case classes — TODO confirm. */
  def buildCaseClasses: String
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
package org.broadinstitute.hail.annotations | ||
|
||
/**
 * Immutable two-level annotation store.
 *
 * `maps` holds named groups of key/value pairs; `vals` holds ungrouped,
 * top-level key/value pairs. Every `add*` method returns a new instance and
 * leaves the receiver untouched. Keys are always compared exactly as given
 * (no case folding).
 *
 * NOTE(review): the overloads of `contains`, `get` and `getOrElse` pick `vals`
 * (one key) or `maps` (group + key) purely by arity; a reviewer asked for
 * names that make the target explicit. The names are kept here so existing
 * callers keep compiling.
 *
 * @param maps named groups, keyed by group name and then by field name
 * @param vals top-level fields, keyed by field name
 * @tparam T type of the stored values
 */
case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T]) extends Serializable {

  /** Total number of stored fields: all entries of all groups plus all top-level values. */
  def nAttrs: Int = maps.valuesIterator.map(_.size).sum + vals.size

  /** True when a group named `str` exists. */
  def hasMap(str: String): Boolean = maps.contains(str)

  /**
   * True when a top-level value is stored under exactly `str`.
   *
   * The key is no longer lower-cased before the lookup: doing so disagreed
   * with `get(str)` / `vals(str)` and made mixed-case keys unreachable.
   */
  def contains(str: String): Boolean = vals.contains(str)

  /** True when group `parent` exists and has an entry for `str`. */
  def contains(parent: String, str: String): Boolean =
    maps.get(parent).exists(_.contains(str))

  /** Top-level value stored under `str`, if any. */
  def get(str: String): Option[T] = vals.get(str)

  /** Value stored under `str` in group `parent`; `None` when the group or the key is missing. */
  def get(parent: String, str: String): Option[T] =
    maps.get(parent).flatMap(_.get(str))

  /** Value stored under `str` in group `parent`, or `default` when the group or the key is missing. */
  def getOrElse(parent: String, str: String, default: T): T =
    get(parent, str).getOrElse(default)

  /** Top-level value stored under `str`, or `default` when absent. Never throws for a missing key. */
  def getOrElse(str: String, default: T): T = vals.getOrElse(str, default)

  // The add* methods keep the original remove-then-add sequence (instead of a
  // bare `+` / `++`) so the iteration order of the resulting maps is exactly
  // what the previous implementation produced.

  /** Copy of this store in which group `name` is replaced by (or set to) `m`. */
  def addMap(name: String, m: Map[String, T]): Annotations[T] =
    Annotations(maps - name + (name -> m), vals)

  /** Copy of this store in which every group in `newMaps` replaces any same-named group. */
  def addMaps(newMaps: Map[String, Map[String, T]]): Annotations[T] =
    Annotations((maps -- newMaps.keys) ++ newMaps, vals)

  /** Copy of this store in which top-level key `name` is bound to `mapping`. */
  def addVal(name: String, mapping: T): Annotations[T] =
    Annotations(maps, vals - name + (name -> mapping))

  /** Copy of this store in which every entry of `newVals` replaces any same-named top-level value. */
  def addVals(newVals: Map[String, T]): Annotations[T] =
    Annotations(maps, (vals -- newVals.keys) ++ newVals)
}
|
||
/** Factory for an `AnnotationSignatures` with no groups and no top-level signatures. */
object EmptyAnnotationSignatures {
  def apply(): AnnotationSignatures =
    Annotations[AnnotationSignature](maps = Map.empty, vals = Map.empty)
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to have AnnotationSignatures.emptyOfString. |
||
|
||
/** Factory for an `AnnotationData` with no groups and no top-level values. */
object EmptyAnnotations {
  def apply(): AnnotationData =
    Annotations[String](maps = Map.empty, vals = Map.empty)
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same. |
||
|
||
/** Factory for per-sample annotation data in which every sample starts out empty. */
object EmptySampleAnnotations {

  /**
   * @param nSamples number of samples; zero or a negative count yields an empty sequence
   * @return one empty `AnnotationData` per sample
   */
  def apply(nSamples: Int): IndexedSeq[AnnotationData] =
    IndexedSeq.fill(nSamples)(Annotations[String](maps = Map.empty, vals = Map.empty))
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use |
||
|
||
/** String helpers for annotation values and annotation type names. */
object AnnotationUtils {

  /**
   * Renders an annotation value as text.
   *
   * Collections and arrays are rendered as their elements joined by `", "`
   * (an empty collection gives `""`); anything else is rendered with
   * `toString`. Arrays are handled explicitly because a JVM array's own
   * `toString` is an identity hash, not its contents.
   *
   * NOTE(review): a reviewer flagged that this appears to duplicate logic in
   * `ConvertibleString`; the two should be unified.
   *
   * @param ar value to render; must not be null
   */
  def annotationToString(ar: AnyRef): String =
    ar match {
      case iter: Iterable[_] => iter.mkString(", ")
      case arr: Array[_] => arr.mkString(", ")
      case _ => ar.toString
    }

  /**
   * Translates an annotation type name into the name of the Scala type used
   * to hold it.
   *
   * Input (left) to output (right): `Flag` to `Boolean`, `Integer` to `Int`,
   * `Float` to `Double`, `String` to `String`. The input names look like VCF
   * header `Type` values — TODO confirm against the caller.
   *
   * @param str annotation type name
   * @return Scala type name, as source text
   * @throws UnsupportedOperationException for any other input
   */
  def parseAnnotationType(str: String): String =
    str match {
      case "Flag" => "Boolean"
      case "Integer" => "Int"
      case "Float" => "Double"
      case "String" => "String"
      case _ => throw new UnsupportedOperationException(s"unexpected annotation type: '$str'")
    }
}
|
||
/**
 * Generates Scala source text for classes that expose annotation data as
 * typed fields. Nothing here compiles or evaluates the text; each method only
 * returns a `String`.
 */
object AnnotationClassBuilder {

  /**
   * Builds the source of one `__<group>Annotations` class per annotation
   * group, followed by a `<hiddenClassName>Annotations` class that holds one
   * member per group and one typed member per top-level value.
   *
   * @param sigs signatures of every group field and top-level field
   * @param hiddenClassName prefix of the generated top-level class name
   * @param makeToString when true, each group class also gets `__fields`,
   *                     `toString` (fields joined by `;`) and `all` (fields
   *                     joined by tab), with fields in sorted key order
   * @param missing text passed to `formatString` in the generated `__fields`
   * @return generated source text, starting with a newline
   */
  def signatures(sigs: AnnotationSignatures, hiddenClassName: String,
    makeToString: Boolean = false, missing: String = ""): String = {
    // Map.map to a non-pair result yields a sequence in the map's iteration
    // order; mkString then concatenates in that same order.
    val internalClasses = sigs.maps.map { case (subclass, subMap) =>
      val fields = subMap.map { case (k, sig) => fieldDeclaration("subMap", k, sig, "\n") }.mkString
      val printers =
        if (makeToString) printerMembers(subMap.keys.toArray.sorted, missing)
        else ""
      s"class __${subclass}Annotations(subMap: Map[String, String]) extends Serializable {\n" +
        fields + printers + "}\n"
    }.mkString

    val groupMembers = sigs.maps.map { case (subclass, _) =>
      s""" val $subclass = new __${subclass}Annotations(annot.maps("$subclass"))\n"""
    }.mkString

    // The top-level declarations carry a space before the newline, as before.
    val valueMembers = sigs.vals.map { case (k, sig) =>
      fieldDeclaration("annot.vals", k, sig, " \n")
    }.mkString

    val hiddenClass = s"class ${hiddenClassName}Annotations" +
      "(annot: org.broadinstitute.hail.annotations.AnnotationData) extends Serializable {\n" +
      groupMembers + valueMembers + "}\n"

    "\n" + internalClasses + hiddenClass
  }

  /** Source of a `val` that instantiates the generated class from the value named `hiddenClassName`. */
  def instantiate(exposedName: String, hiddenClassName: String): String =
    s"val $exposedName = new ${hiddenClassName}Annotations($hiddenClassName)\n"

  /** Source of a `val` that maps the named annotation collection to instances of the generated class. */
  def makeIndexedSeq(hiddenOutputName: String, hiddenClassName: String, hiddenAnnotationArrayName: String): String =
    s"val $hiddenOutputName: IndexedSeq[${hiddenClassName}Annotations] = " +
      s"$hiddenAnnotationArrayName.map(new ${hiddenClassName}Annotations(_))\n"

  val arrayRegex = """Array\[(\w+)\]""".r
  val optionRegex = """Option\[(\w+)\]""".r

  /** One generated field: ` val <name>: <type> = <lookup>.getOrElse("<name>", "<default>").<conversion>` + `lineEnd`. */
  private def fieldDeclaration(lookup: String, name: String, sig: AnnotationSignature, lineEnd: String): String = {
    val default = getDefault(sig.getType)
    s""" val $name: ${sig.getType} = $lookup.getOrElse("$name", "$default").${sig.conversion}$lineEnd"""
  }

  /** Generated `__fields`, `toString` and `all` members for one group class. */
  private def printerMembers(sortedKeys: Array[String], missing: String): String = {
    // mkString on an empty array is "", which yields `Array()` as before.
    val fieldExprs = sortedKeys.map(_ + s""".formatString("$missing")""").mkString(",")
    " def __fields: Array[String] = Array(" + fieldExprs + ")\n" +
      """ override def toString: String = if (__fields.length == 0) "" else __fields.reduceRight(_ + ";" + _)""" + "\n" +
      """ def all: String = if (__fields.length == 0) "" else __fields.reduceRight(_ + "\t" + _)""" + "\n"
  }

  /**
   * Raw text substituted when a field is missing, chosen from the field's
   * Scala type name.
   *
   * NOTE(review): for `Array[X]` this is the default of the element type `X`,
   * not an empty array. A reviewer questioned this and asked why options are
   * not used throughout; the behaviour is kept unchanged pending that rework.
   */
  private def getDefault(typeStr: String): String =
    typeStr match {
      case "Int" | "Double" => "0"
      case "Boolean" => "false"
      case optionRegex(_) => "None"
      case arrayRegex(elementType) => getDefault(elementType)
      case _ => ""
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package org.broadinstitute.hail.annotations | ||
|
||
/**
 * Signature backed directly by constructor values.
 *
 * @param scalaType value returned by `getType`
 * @param conversionMethod value returned by `conversion`
 * @param description free-text description; not used by any member defined here
 */
case class SimpleSignature(scalaType: String, conversionMethod: String, description: String)
  extends AnnotationSignature {

  override def getType: String = scalaType

  override def conversion: String = conversionMethod

  /** A simple signature contributes no extra source text. */
  override def buildCaseClasses: String = ""
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can implement a def with a value. So I think you can write something like |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package org.broadinstitute.hail | ||
|
||
package object annotations {

  /** Annotation store whose values are raw strings. */
  type AnnotationData = Annotations[String]

  /** Annotation store whose values are the signatures describing each field. */
  type AnnotationSignatures = Annotations[AnnotationSignature]
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
package org.broadinstitute.hail.driver | ||
|
||
import org.broadinstitute.hail.Utils._ | ||
import org.broadinstitute.hail.methods._ | ||
import org.broadinstitute.hail.variant._ | ||
import org.broadinstitute.hail.annotations._ | ||
import org.kohsuke.args4j.{Option => Args4jOption} | ||
|
||
/**
 * Command `exportgenotypes`: evaluates a user-supplied, comma-separated
 * condition for every (variant, sample, genotype) and writes the resulting
 * strings as text, plus a separate one-line header file.
 */
object ExportGenotypes extends Command {

  class Options extends BaseOptions {

    @Args4jOption(required = true, name = "-o", aliases = Array("--output"),
      usage = "path of output tsv")
    var output: String = _

    @Args4jOption(required = true, name = "-c", aliases = Array("--condition"),
      usage = "Comma-separated list of fields to be printed to tsv")
    var condition: String = _

    @Args4jOption(required = false, name = "--missing",
      usage = "Format of missing values (Default: 'NA')")
    var missing = "NA"
  }

  def newOptions = new Options

  def name = "exportgenotypes"

  def description = "Export list of sample-variant information to tsv"

  /**
   * Compiles the condition, writes `<output>.header`, deletes any existing
   * `output` path recursively, then saves one line per genotype to `output`.
   *
   * @return `state`, unchanged
   */
  def run(state: State, options: Options): State = {
    val vds = state.vds
    val cond = options.condition
    val output = options.output

    val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures
    val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures
    val sa = vds.metadata.sampleAnnotations

    val makeString: IndexedSeq[AnnotationData] => ((Variant, AnnotationData) =>
      ((Int, Sample, Genotype) => String)) =
      try {
        val cf = new ExportGenotypeEvaluator(cond, vas, sas, sa, options.missing)
        cf.typeCheck()
        cf.apply
      } catch {
        case e: scala.tools.reflect.ToolBoxError =>
          /* e.message looks like:
             reflective compilation has failed:

             ';' expected but '.' found. */
          fatal("parse error in condition: " + e.message.split("\n").last)
      }

    val sampleIdsBc = state.sc.broadcast(vds.sampleIds)

    val stringVDS = vds.mapValuesWithAll((v: Variant, va: AnnotationData, s: Int, g: Genotype) =>
      makeString(sa)(v, va)(s, Sample(sampleIdsBc.value(s)), g))

    // FIXME add additional command parsing functionality
    val variantRegex = """v\.(\w+)""".r
    val sampleRegex = """s\.(\w+)""".r
    val topLevelSampleAnnoRegex = """sa\.(\w+)""".r
    val topLevelVariantAnnoRegex = """va\.(\w+)""".r
    val samplePrintMapRegex = """sa\.(\w+)\.all""".r
    val variantPrintMapRegex = """va\.(\w+)\.all""".r
    val annoRegex = """\wa\.(.+)""".r

    // "<group>:<k1>;<k2>..." with sorted keys when `group` is a non-empty
    // annotation group, otherwise just the name itself.
    def groupSummary(sigs: AnnotationSignatures, group: String): String =
      sigs.maps.get(group) match {
        case Some(fields) if fields.nonEmpty => s"$group:" + fields.keys.toArray.sorted.mkString(";")
        case _ => group
      }

    // Tab-separated keys of `group`; like the original, throws if the group is absent.
    def groupColumns(sigs: AnnotationSignatures, group: String): String = {
      val keys = sigs.maps(group).keys
      if (keys.isEmpty) group else keys.mkString("\t")
    }

    // NOTE(review): this mapper is not called anywhere in this method yet;
    // the header below is still derived by plain string splitting.
    def mapColumnNames(input: String): String = {
      input match {
        case "v" => "Variant"
        case "s" => "Sample"
        case "va" =>
          fatal("parse error in condition: cannot print 'va', choose a group or value in annotations")
        case "sa" =>
          fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations")
        case variantRegex(x) => x
        case sampleRegex(x) => x
        case topLevelSampleAnnoRegex(x) => groupSummary(sas, x)
        case topLevelVariantAnnoRegex(x) => groupSummary(vas, x)
        case samplePrintMapRegex(x) => groupColumns(sas, x)
        case variantPrintMapRegex(x) => groupColumns(vas, x)
        case annoRegex(x) => x
        case _ => input
      }
    }

    // Header column = text after the last '.' of each requested field.
    // lastOption/mkString keep this from throwing when a split comes back
    // empty (for example a condition made only of commas, or a bare ".").
    val header = cond.split(",")
      .map(field => field.split("\\.").lastOption.getOrElse(field))
      .mkString("\t")

    writeTextFile(output + ".header", state.hadoopConf) { s =>
      s.write(header)
      s.write("\n")
    }

    hadoopDelete(output, state.hadoopConf, recursive = true)

    stringVDS.rdd
      .flatMap { case (v, va, strings) => strings }
      .saveAsTextFile(output)

    state
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Too imperative.
maps.values.map(_.size).sum + vals.size