Skip to content

Commit

Permalink
Reference Genome #3 (hail-is#2086)
Browse files Browse the repository at this point in the history
* Reference Genome #3

- Added genome reference to TVariant, TInterval, TLocus

* fixed tvariant sig match

* fixed type unify error

* fixed rebase

* fixed hailrep for variant

* fixed hailrep for locus and interval

* fix tests

* add GRVariable

* Attempt #3

* Changed vSig for GDB

* fixup

* addressed comments
  • Loading branch information
jigold authored and jbloom22 committed Oct 31, 2017
1 parent 359c0df commit 29369c7
Show file tree
Hide file tree
Showing 29 changed files with 304 additions and 167 deletions.
41 changes: 31 additions & 10 deletions python/hail/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ def _from_java(cls, jtype):
return TDict._from_java(jtype)
elif class_name == 'is.hail.expr.TStruct':
return TStruct._from_java(jtype)
elif class_name == 'is.hail.expr.TVariant':
return TVariant._from_java(jtype)
elif class_name == 'is.hail.expr.TLocus':
return TLocus._from_java(jtype)
elif class_name == 'is.hail.expr.TInterval':
return TInterval._from_java(jtype)
else:
raise TypeError("unknown type class: '%s'" % class_name)

Expand Down Expand Up @@ -563,10 +569,16 @@ class TVariant(Type):
- in Python, values are instances of :class:`hail.representation.Variant`
"""
__metaclass__ = SingletonType

def __init__(self):
super(TVariant, self).__init__(scala_object(Env.hail().expr, 'TVariant'))
jtype = scala_object(Env.hail().expr, 'TVariant').apply(Env.hail().variant.GenomeReference.GRCh37())
super(TVariant, self).__init__(jtype)

@classmethod
def _from_java(cls, jtype):
# Wrap an already-existing Java type object in a TVariant wrapper.
# The instance is allocated with __new__, so __init__ is not run and the
# wrapper keeps exactly the jtype it was handed.
# NOTE(review): calling __new__ directly presumably also sidesteps the
# SingletonType metaclass's handling of instance creation -- confirm.
v = TVariant.__new__(cls)
# Attach the supplied Java object to the freshly allocated wrapper.
v._jtype = jtype
return v

def _convert_to_py(self, annotation):
if annotation:
Expand Down Expand Up @@ -707,10 +719,16 @@ class TLocus(Type):
- in Python, values are instances of :class:`hail.representation.Locus`
"""
__metaclass__ = SingletonType

def __init__(self):
super(TLocus, self).__init__(scala_object(Env.hail().expr, 'TLocus'))
jtype = scala_object(Env.hail().expr, 'TLocus').apply(Env.hail().variant.GenomeReference.GRCh37())
super(TLocus, self).__init__(jtype)

@classmethod
def _from_java(cls, jtype):
# Wrap an already-existing Java type object in a TLocus wrapper.
# The instance is allocated with __new__, so __init__ is not run and the
# wrapper keeps exactly the jtype it was handed.
# NOTE(review): calling __new__ directly presumably also sidesteps the
# SingletonType metaclass's handling of instance creation -- confirm.
l = TLocus.__new__(cls)
# Attach the supplied Java object to the freshly allocated wrapper.
l._jtype = jtype
return l

def _convert_to_py(self, annotation):
if annotation:
Expand Down Expand Up @@ -743,10 +761,16 @@ class TInterval(Type):
- in Python, values are instances of :class:`hail.representation.Interval`
"""
__metaclass__ = SingletonType

def __init__(self):
super(TInterval, self).__init__(scala_object(Env.hail().expr, 'TInterval'))
jtype = scala_object(Env.hail().expr, 'TInterval').apply(Env.hail().variant.GenomeReference.GRCh37())
super(TInterval, self).__init__(jtype)

@classmethod
def _from_java(cls, jtype):
# Wrap an already-existing Java type object in a TInterval wrapper.
# The instance is allocated with __new__, so __init__ is not run and the
# wrapper keeps exactly the jtype it was handed.
# NOTE(review): calling __new__ directly presumably also sidesteps the
# SingletonType metaclass's handling of instance creation -- confirm.
i = TInterval.__new__(cls)
# Attach the supplied Java object to the freshly allocated wrapper.
i._jtype = jtype
return i

def _convert_to_py(self, annotation):
if annotation:
Expand Down Expand Up @@ -775,12 +799,9 @@ def __repr__(self):
'is.hail.expr.TFloat64$': TFloat64,
'is.hail.expr.TBoolean$': TBoolean,
'is.hail.expr.TString$': TString,
'is.hail.expr.TVariant$': TVariant,
'is.hail.expr.TAltAllele$': TAltAllele,
'is.hail.expr.TLocus$': TLocus,
'is.hail.expr.TGenotype$': TGenotype,
'is.hail.expr.TCall$': TCall,
'is.hail.expr.TInterval$': TInterval}
'is.hail.expr.TCall$': TCall}

import pprint

Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/is/hail/HailContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import is.hail.io.vcf._
import is.hail.keytable.KeyTable
import is.hail.stats.{BaldingNicholsModel, Distribution, UniformDist}
import is.hail.utils.{log, _}
import is.hail.variant.{GenericDataset, Genotype, Locus, VSMFileMetadata, VSMSubgen, Variant, VariantDataset, VariantSampleMatrix}
import is.hail.variant.{GenericDataset, GenomeReference, Genotype, Locus, VSMFileMetadata, VSMSubgen, Variant, VariantDataset, VariantSampleMatrix}
import org.apache.hadoop
import org.apache.log4j.{ConsoleAppender, LogManager, PatternLayout, PropertyConfigurator}
import org.apache.spark.deploy.SparkHadoopUtil
Expand Down Expand Up @@ -296,7 +296,7 @@ class HailContext private(val sc: SparkContext,

val signature = TStruct("rsid" -> TString, "varid" -> TString)

val rdd = sc.union(results.map(_.rdd)).toOrderedRDD(TVariant.orderedKey, classTag[(Annotation, Iterable[Annotation])])
val rdd = sc.union(results.map(_.rdd)).toOrderedRDD(TVariant(GenomeReference.GRCh37).orderedKey, classTag[(Annotation, Iterable[Annotation])])

new GenericDataset(this,
VSMFileMetadata(samples,
Expand Down Expand Up @@ -575,7 +575,7 @@ class HailContext private(val sc: SparkContext,

def eval(expr: String): (Annotation, Type) = {
val ec = EvalContext(
"v" -> TVariant,
"v" -> TVariant(GenomeReference.GRCh37),
"s" -> TString,
"g" -> TGenotype,
"sa" -> TStruct(
Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/is/hail/annotations/Annotation.scala
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ object Annotation {
null
else
t match {
case TVariant => a.asInstanceOf[Variant].toRow
case _: TVariant => a.asInstanceOf[Variant].toRow
case TGenotype => Genotype.toRow(a.asInstanceOf[Genotype])
case TLocus => a.asInstanceOf[Locus].toRow
case _: TLocus => a.asInstanceOf[Locus].toRow

case TArray(elementType) =>
a.asInstanceOf[IndexedSeq[_]].map(expandAnnotation(_, elementType))
Expand All @@ -79,7 +79,7 @@ object Annotation {

case TAltAllele => a.asInstanceOf[AltAllele].toRow

case TInterval =>
case _: TInterval =>
val i = a.asInstanceOf[Interval[Locus]]
Annotation(i.start.toRow,
i.end.toRow)
Expand Down
10 changes: 5 additions & 5 deletions src/main/scala/is/hail/annotations/MemoryBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ class RegionValueBuilder(region: MemoryBuffer) {
}
endArray()

case TVariant =>
case t: TVariant =>
val v = a.asInstanceOf[Variant]
startStruct()
addString(v.contig)
Expand Down Expand Up @@ -652,18 +652,18 @@ class RegionValueBuilder(region: MemoryBuffer) {
addBoolean(g._isLinearScale)
endStruct()

case TLocus =>
case t: TLocus =>
val l = a.asInstanceOf[Locus]
startStruct()
addString(l.contig)
addInt(l.position)
endStruct()

case TInterval =>
case t: TInterval =>
val i = a.asInstanceOf[Interval[Locus]]
startStruct()
addAnnotation(TLocus, i.start)
addAnnotation(TLocus, i.end)
addAnnotation(TLocus(t.gr), i.start)
addAnnotation(TLocus(t.gr), i.end)
endStruct()
}
}
Expand Down
14 changes: 7 additions & 7 deletions src/main/scala/is/hail/annotations/UnsafeRow.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, KryoSerializable}
import is.hail.expr._
import is.hail.utils.Interval
import is.hail.variant.{AltAllele, GenericGenotype, Locus, Variant}
import is.hail.variant.{AltAllele, GenericGenotype, GenomeReference, Locus, Variant}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Row

Expand Down Expand Up @@ -76,7 +76,7 @@ object UnsafeRow {
new String(readBinary(region, offset))

def readLocus(region: MemoryBuffer, offset: Long): Locus = {
val ft = TLocus.fundamentalType.asInstanceOf[TStruct]
val ft = TLocus(GenomeReference.GRCh37).fundamentalType.asInstanceOf[TStruct]
Locus(
readString(region, offset + ft.byteOffsets(0)),
region.loadInt(offset + ft.byteOffsets(1)))
Expand Down Expand Up @@ -146,17 +146,17 @@ object UnsafeRow {
case struct: TStruct =>
readStruct(region, offset, ttBc)

case TVariant =>
val ft = TVariant.fundamentalType.asInstanceOf[TStruct]
case x: TVariant =>
val ft = x.fundamentalType.asInstanceOf[TStruct]
Variant(
readString(region, offset + ft.byteOffsets(0)),
region.loadInt(offset + ft.byteOffsets(1)),
readString(region, offset + ft.byteOffsets(2)),
readArrayAltAllele(region, offset + ft.byteOffsets(3)))
case TLocus => readLocus(region, offset)
case x: TLocus => readLocus(region, offset)
case TAltAllele => readAltAllele(region, offset)
case TInterval =>
val ft = TInterval.fundamentalType.asInstanceOf[TStruct]
case x: TInterval =>
val ft = x.fundamentalType.asInstanceOf[TStruct]
Interval[Locus](
readLocus(region, offset + ft.byteOffsets(0)),
readLocus(region, offset + ft.byteOffsets(1)))
Expand Down
44 changes: 22 additions & 22 deletions src/main/scala/is/hail/expr/AnnotationImpex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ object SparkAnnotationImpex extends AnnotationImpex[DataType, Any] {

def requiresConversion(t: Type): Boolean = t match {
case TArray(elementType) => requiresConversion(elementType)
case TSet(_) | TDict(_, _) | TGenotype | TAltAllele | TVariant | TLocus | TInterval => true
case TSet(_) | TDict(_, _) | TGenotype | TAltAllele | TVariant(_) | TLocus(_) | TInterval(_) => true
case TStruct(fields) =>
fields.isEmpty || fields.exists(f => requiresConversion(f.typ))
case _ => false
Expand Down Expand Up @@ -87,17 +87,17 @@ object SparkAnnotationImpex extends AnnotationImpex[DataType, Any] {
case TAltAllele =>
val r = a.asInstanceOf[Row]
AltAllele(r.getAs[String](0), r.getAs[String](1))
case TVariant =>
case _: TVariant =>
val r = a.asInstanceOf[Row]
Variant(r.getAs[String](0), r.getAs[Int](1), r.getAs[String](2),
r.getAs[Seq[Row]](3).map(aa =>
importAnnotation(aa, TAltAllele).asInstanceOf[AltAllele]).toArray)
case TLocus =>
case _: TLocus =>
val r = a.asInstanceOf[Row]
Locus(r.getAs[String](0), r.getAs[Int](1))
case TInterval =>
case x: TInterval =>
val r = a.asInstanceOf[Row]
Interval(importAnnotation(r.get(0), TLocus).asInstanceOf[Locus], importAnnotation(r.get(1), TLocus).asInstanceOf[Locus])
Interval(importAnnotation(r.get(0), TLocus(x.gr)).asInstanceOf[Locus], importAnnotation(r.get(1), TLocus(x.gr)).asInstanceOf[Locus])
case TStruct(fields) =>
if (fields.isEmpty)
if (a.asInstanceOf[Boolean]) Annotation.empty else null
Expand Down Expand Up @@ -126,9 +126,9 @@ object SparkAnnotationImpex extends AnnotationImpex[DataType, Any] {
StructField("key", keyType.schema),
StructField("value", valueType.schema))))
case TAltAllele => AltAllele.sparkSchema
case TVariant => Variant.sparkSchema
case TLocus => Locus.sparkSchema
case TInterval => StructType(Array(
case _: TVariant => Variant.sparkSchema
case _: TLocus => Locus.sparkSchema
case _: TInterval => StructType(Array(
StructField("start", Locus.sparkSchema, nullable = false),
StructField("end", Locus.sparkSchema, nullable = false)))
case TGenotype => Genotype.sparkSchema
Expand Down Expand Up @@ -168,15 +168,15 @@ object SparkAnnotationImpex extends AnnotationImpex[DataType, Any] {
case TAltAllele =>
val aa = a.asInstanceOf[AltAllele]
Row(aa.ref, aa.alt)
case TVariant =>
case TVariant(gr) =>
val v = a.asInstanceOf[Variant]
Row(v.contig, v.start, v.ref, v.altAlleles.map(aa => Row(aa.ref, aa.alt)))
case TLocus =>
case TLocus(gr) =>
val l = a.asInstanceOf[Locus]
Row(l.contig, l.position)
case TInterval =>
case TInterval(gr) =>
val i = a.asInstanceOf[Interval[_]]
Row(exportAnnotation(i.start, TLocus), exportAnnotation(i.end, TLocus))
Row(exportAnnotation(i.start, TLocus(gr)), exportAnnotation(i.end, TLocus(gr)))
case TStruct(fields) =>
if (fields.isEmpty)
a != null
Expand Down Expand Up @@ -300,9 +300,9 @@ object JSONAnnotationImpex extends AnnotationImpex[Type, JValue] {
case TCall => JInt(a.asInstanceOf[Int])
case TGenotype => Genotype.toJSON(a.asInstanceOf[Genotype])
case TAltAllele => a.asInstanceOf[AltAllele].toJSON
case TVariant => a.asInstanceOf[Variant].toJSON
case TLocus => a.asInstanceOf[Locus].toJSON
case TInterval => a.asInstanceOf[Interval[Locus]].toJSON(TLocus.toJSON(_))
case TVariant(_) => a.asInstanceOf[Variant].toJSON
case TLocus(_) => a.asInstanceOf[Locus].toJSON
case TInterval(gr) => a.asInstanceOf[Interval[Locus]].toJSON(TLocus(gr).toJSON(_))
case TStruct(fields) =>
val row = a.asInstanceOf[Row]
JObject(fields
Expand Down Expand Up @@ -384,11 +384,11 @@ object JSONAnnotationImpex extends AnnotationImpex[Type, JValue] {
}
case (_, TAltAllele) =>
jv.extract[AltAllele]
case (_, TVariant) =>
case (_, TVariant(_)) =>
jv.extract[JSONExtractVariant].toVariant
case (_, TLocus) =>
case (_, TLocus(_)) =>
jv.extract[Locus]
case (_, TInterval) =>
case (_, TInterval(_)) =>
jv.extract[JSONExtractInterval].toInterval
case (_, TGenotype) =>
jv.extract[JSONExtractGenotype].toGenotype
Expand Down Expand Up @@ -425,7 +425,7 @@ object TableAnnotationImpex extends AnnotationImpex[Unit, String] {
case it: TIterable => JsonMethods.compact(it.toJSON(a))
case t: TStruct => JsonMethods.compact(t.toJSON(a))
case TGenotype => JsonMethods.compact(t.toJSON(a))
case TInterval =>
case _: TInterval =>
val i = a.asInstanceOf[Interval[Locus]]
if (i.start.contig == i.end.contig)
s"${ i.start }-${ i.end.position }"
Expand All @@ -443,9 +443,9 @@ object TableAnnotationImpex extends AnnotationImpex[Unit, String] {
case TFloat32 => a.toFloat
case TFloat64 => if (a == "nan") Double.NaN else a.toDouble
case TBoolean => a.toBoolean
case TLocus => Locus.parse(a)
case TInterval => Locus.parseInterval(a)
case TVariant => Variant.parse(a)
case _: TLocus => Locus.parse(a)
case _: TInterval => Locus.parseInterval(a)
case _: TVariant => Variant.parse(a)
case TAltAllele => a.split("/") match {
case Array(ref, alt) => AltAllele(ref, alt)
}
Expand Down
Loading

0 comments on commit 29369c7

Please sign in to comment.