Skip to content

Commit

Permalink
Reference Genome #6: Type fixes (#2208)
Browse files Browse the repository at this point in the history
* Reference Genome #6: Type fixes

* fixed test

* Addressed comments

* addressed comments

* addressed comments
  • Loading branch information
jigold authored and cseed committed Oct 16, 2017
1 parent 9884497 commit 653dbfd
Show file tree
Hide file tree
Showing 12 changed files with 148 additions and 84 deletions.
20 changes: 10 additions & 10 deletions python/hail/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def annotate_genotypes_expr(self, expr):
``expr`` is in genotype context so the following symbols are in scope:
- ``g``: genotype annotation
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``va``: variant annotations
- ``s`` (*Sample*): sample
- ``sa``: sample annotations
Expand Down Expand Up @@ -603,7 +603,7 @@ def annotate_variants_expr(self, expr):
``expr`` is in variant context so the following symbols are in scope:
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``va``: variant annotations
- ``global``: global annotations
- ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v``
Expand Down Expand Up @@ -695,7 +695,7 @@ def annotate_variants_table(self, table, root=None, expr=None, vds_key=None, pro
Each expression in the list ``vds_key`` has the following symbols in
scope:
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``va``: variant annotations
**The** ``root`` **and** ``expr`` **arguments**
Expand Down Expand Up @@ -1776,13 +1776,13 @@ def filter_alleles(self, expr, annotation='va = va', subset=True, keep=True,
The following symbols are in scope for ``expr``:
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``va``: variant annotations
- ``aIndex`` (*Int*): the index of the allele being tested
The following symbols are in scope for ``annotation``:
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``va``: variant annotations
- ``aIndices`` (*Array[Int]*): the array of old indices (such that ``aIndices[newIndex] = oldIndex`` and ``aIndices[0] = 0``)
Expand Down Expand Up @@ -1835,7 +1835,7 @@ def filter_genotypes(self, expr, keep=True):
``expr`` is in genotype context so the following symbols are in scope:
- ``s`` (*Sample*): sample
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``sa``: sample annotations
- ``va``: variant annotations
- ``global``: global annotations
Expand Down Expand Up @@ -2065,7 +2065,7 @@ def filter_variants_expr(self, expr, keep=True):
The following symbols are in scope for ``expr``:
- ``v`` (*Variant*): :ref:`variant`
- ``v`` (*Variant(GR)*): :ref:`variant(gr)`
- ``va``: variant annotations
- ``global``: global annotations
- ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v``
Expand Down Expand Up @@ -4217,13 +4217,13 @@ def query_variants(self, exprs):
The namespace of the expressions includes:
- ``global``: global annotations
- ``variants`` (*Aggregable[Variant]*): aggregable of :ref:`variant`
- ``variants`` (*Aggregable[Variant(GR)]*): aggregable of :ref:`variant(gr)`
Map and filter expressions on this aggregable have the additional
namespace:
- ``global``: global annotations
- ``v``: :ref:`variant`
- ``v``: :ref:`variant(gr)`
- ``va``: variant annotations
**Performance Note**
Expand Down Expand Up @@ -4277,7 +4277,7 @@ def query_genotypes_typed(self, exprs):
- ``global``: global annotations
- ``g``: :ref:`genotype`
- ``v``: :ref:`variant`
- ``v``: :ref:`variant(gr)`
- ``va``: variant annotations
- ``s``: sample
- ``sa``: sample annotations
Expand Down
6 changes: 3 additions & 3 deletions python/hail/docs/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ Variant Dataset (VDS)
.. image:: misc/hail-vds-rep.png

Hail represents a genetic data set as a matrix where the rows are keyed by
:ref:`variant` objects, the columns are keyed by samples, and each cell is a
:ref:`genotype` object. :ref:`variant` objects and :ref:`genotype` objects each
:ref:`variant(gr)` objects, the columns are keyed by samples, and each cell is a
:ref:`genotype` object. :ref:`variant(gr)` objects and :ref:`genotype` objects each
have methods to access attributes such as chromosome name and genotype call.
Although this representation is similar to the VCF format, Hail uses a fast and
storage-efficient internal representation called a Variant Dataset (**VDS**).
Expand Down Expand Up @@ -64,7 +64,7 @@ The abbreviations for the VDS elements in expressions are as follows:
* - Symbol
- Description
* - ``v``
- :ref:`variant`
- :ref:`variant(gr)`
* - ``s``
- sample
* - ``va``
Expand Down
12 changes: 6 additions & 6 deletions src/main/scala/is/hail/annotations/UnsafeRow.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, KryoSerializable}
import is.hail.expr._
import is.hail.utils._
import is.hail.variant.{AltAllele, GenericGenotype, GenomeReference, Locus, Variant}
import is.hail.variant.{AltAllele, GRBase, GenericGenotype, Locus, Variant}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Row

Expand Down Expand Up @@ -115,8 +115,8 @@ object UnsafeRow {
def readString(region: MemoryBuffer, boff: Long): String =
new String(readBinary(region, boff))

def readLocus(region: MemoryBuffer, offset: Long): Locus = {
val ft = TLocus(GenomeReference.GRCh37).fundamentalType.asInstanceOf[TStruct]
def readLocus(region: MemoryBuffer, offset: Long, gr: GRBase): Locus = {
val ft = gr.locus.fundamentalType.asInstanceOf[TStruct]
Locus(
readString(region, ft.loadField(region, offset, 0)),
region.loadInt(ft.loadField(region, offset, 1)))
Expand Down Expand Up @@ -189,13 +189,13 @@ object UnsafeRow {
region.loadInt(ft.loadField(region, offset, 1)),
readString(region, ft.loadField(region, offset, 2)),
readArrayAltAllele(region, ft.loadField(region, offset, 3)))
case x: TLocus => readLocus(region, offset)
case x: TLocus => readLocus(region, offset, x.gr)
case TAltAllele => readAltAllele(region, offset)
case x: TInterval =>
val ft = x.fundamentalType.asInstanceOf[TStruct]
Interval[Locus](
readLocus(region, ft.loadField(region, offset, 0)),
readLocus(region, ft.loadField(region, offset, 1)))
readLocus(region, ft.loadField(region, offset, 0), x.gr),
readLocus(region, ft.loadField(region, offset, 1), x.gr))
case TGenotype =>
val ft = TGenotype.fundamentalType.asInstanceOf[TStruct]
val gt: Int =
Expand Down
45 changes: 23 additions & 22 deletions src/main/scala/is/hail/expr/FunctionRegistry.scala
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,8 @@ object FunctionRegistry {
def typ = TTBoxed
}

val GR = GRVariable(GenomeReference.GRCh37)
val GR = GRVariable()

private def nonceToNullable[T : TypeInfo, U >: Null](check: Code[T] => Code[Boolean], v: Code[T], ifPresent: Code[T] => Code[U]): CM[Code[U]] = for (
(stx, x) <- CM.memoize(v)
) yield Code(stx, check(x).mux(Code._null[U], ifPresent(x)))
Expand Down Expand Up @@ -653,7 +654,7 @@ object FunctionRegistry {
"""
Produce an array of called counts for each allele in the variant (including reference). For example, calling this function with a biallelic variant on hom-ref, het, and hom-var calls will produce ``[2, 0]``, ``[1, 1]``, and ``[0, 2]`` respectively.
""",
"v" -> ":ref:`variant`")(callHr, variantHr(GR), arrayHr(int32Hr))
"v" -> ":ref:`variant(gr)`")(callHr, variantHr(GR), arrayHr(int32Hr))
registerMethodSpecial("oneHotGenotype", { (c: () => Any, v: () => Any) =>
val call = c().asInstanceOf[Call]
val variant = v().asInstanceOf[Variant]
Expand All @@ -665,7 +666,7 @@ object FunctionRegistry {
"""
Produces an array with one element for each possible genotype in the variant, where the called genotype is 1 and all else 0. For example, calling this function with a biallelic variant on hom-ref, het, and hom-var calls will produce ``[1, 0, 0]``, ``[0, 1, 0]``, and ``[0, 0, 1]`` respectively.
""",
"v" -> ":ref:`variant`")(callHr, variantHr(GR), arrayHr(int32Hr))
"v" -> ":ref:`variant(gr)`")(callHr, variantHr(GR), arrayHr(int32Hr))

registerFieldCode("gt", { (x: Code[Genotype]) =>
nonceToNullable[Int, java.lang.Integer](_.ceq(-1), x.invoke[Int]("_unboxedGT"), boxInt(_))
Expand Down Expand Up @@ -797,8 +798,8 @@ object FunctionRegistry {
registerMethod("isAutosomal", { (x: Variant) => x.isAutosomal }, "True if chromosome is not X, not Y, and not MT.")(variantHr(GR), boolHr)
registerField("contig", { (x: Locus) => x.contig }, "String representation of contig.")(locusHr(GR), stringHr)
registerField("position", { (x: Locus) => x.position }, "Chromosomal position.")(locusHr(GR), int32Hr)
registerField("start", { (x: Interval[Locus]) => x.start }, ":ref:`locus` at the start of the interval (inclusive).")(locusIntervalHr(GR), locusHr(GR))
registerField("end", { (x: Interval[Locus]) => x.end }, ":ref:`locus` at the end of the interval (exclusive).")(locusIntervalHr(GR), locusHr(GR))
registerField("start", { (x: Interval[Locus]) => x.start }, ":ref:`locus(gr)` at the start of the interval (inclusive).")(locusIntervalHr(GR), locusHr(GR))
registerField("end", { (x: Interval[Locus]) => x.end }, ":ref:`locus(gr)` at the end of the interval (exclusive).")(locusIntervalHr(GR), locusHr(GR))
registerField("ref", { (x: AltAllele) => x.ref }, "Reference allele base sequence.")
registerField("alt", { (x: AltAllele) => x.alt }, "Alternate allele base sequence.")
registerMethod("isSNP", { (x: AltAllele) => x.isSNP }, "True if ``v.ref`` and ``v.alt`` are the same length and differ in one position.")
Expand Down Expand Up @@ -973,18 +974,18 @@ object FunctionRegistry {

register("Variant", { (x: String) => Variant.parse(x) },
"""
Construct a :ref:`variant` object.
Construct a :ref:`variant(gr)` object.
.. code-block:: text
:emphasize-lines: 2
let v = Variant("7:76324539:A:G") in v.contig
result: "7"
""",
"s" -> "String of the form ``CHR:POS:REF:ALT`` or ``CHR:POS:REF:ALT1,ALT2...ALTN`` specifying the contig, position, reference and alternate alleles.")(stringHr, variantHr(GR))
"s" -> "String of the form ``CHR:POS:REF:ALT`` or ``CHR:POS:REF:ALT1,ALT2...ALTN`` specifying the contig, position, reference and alternate alleles.")(stringHr, variantHr(GenomeReference.GRCh37))
register("Variant", { (x: String, y: Int, z: String, a: String) => Variant(x, y, z, a) },
"""
Construct a :ref:`variant` object.
Construct a :ref:`variant(gr)` object.
.. code-block:: text
:emphasize-lines: 2
Expand All @@ -995,10 +996,10 @@ object FunctionRegistry {
"contig" -> "String representation of contig.",
"pos" -> "SNP position or start of an indel.",
"ref" -> "Reference allele sequence.",
"alt" -> "Alternate allele sequence.")(stringHr, int32Hr, stringHr, stringHr, variantHr(GR))
"alt" -> "Alternate allele sequence.")(stringHr, int32Hr, stringHr, stringHr, variantHr(GenomeReference.GRCh37))
register("Variant", { (x: String, y: Int, z: String, a: IndexedSeq[String]) => Variant(x, y, z, a.toArray) },
"""
Construct a :ref:`variant` object.
Construct a :ref:`variant(gr)` object.
.. code-block:: text
:emphasize-lines: 2
Expand All @@ -1010,7 +1011,7 @@ object FunctionRegistry {
"pos" -> "SNP position or start of an indel.",
"ref" -> "Reference allele sequence.",
"alts" -> "Array of alternate allele sequences."
)(stringHr, int32Hr, stringHr, arrayHr(stringHr), variantHr(GR))
)(stringHr, int32Hr, stringHr, arrayHr(stringHr), variantHr(GenomeReference.GRCh37))

register("Dict", { (keys: IndexedSeq[Annotation], values: IndexedSeq[Annotation]) =>
if (keys.length != values.length)
Expand All @@ -1020,7 +1021,7 @@ object FunctionRegistry {
"keys" -> "Keys of Dict.",
"values" -> "Values of Dict.")(arrayHr(TTHr), arrayHr(TUHr), dictHr(TTHr, TUHr))

val combineVariantsStruct = TStruct(Array(("variant", TVariant(GenomeReference.GRCh37), "Resulting combined variant."),
val combineVariantsStruct = TStruct(Array(("variant", TVariant(GR), "Resulting combined variant."),
("laIndices", TDict(TInt32, TInt32), "Mapping from new to old allele index for the left variant."),
("raIndices", TDict(TInt32, TInt32), "Mapping from new to old allele index for the right variant.")
).zipWithIndex.map { case ((n, t, d), i) => Field(n, t, i, Map(("desc", d))) })
Expand Down Expand Up @@ -1076,7 +1077,7 @@ object FunctionRegistry {
Locus(chr, pos.toInt)
},
"""
Construct a :ref:`locus` object.
Construct a :ref:`locus(gr)` object.
.. code-block:: text
:emphasize-lines: 2
Expand All @@ -1085,11 +1086,11 @@ object FunctionRegistry {
result: 10040532
""",
("s", "String of the form ``CHR:POS``")
)(stringHr, locusHr(GR))
)(stringHr, locusHr(GenomeReference.GRCh37))

register("Locus", { (x: String, y: Int) => Locus(x, y) },
"""
Construct a :ref:`locus` object.
Construct a :ref:`locus(gr)` object.
.. code-block:: text
:emphasize-lines: 2
Expand All @@ -1098,10 +1099,10 @@ object FunctionRegistry {
result: 10040532
""",
"contig" -> "String representation of contig.",
"pos" -> "SNP position or start of an indel.")(stringHr, int32Hr, locusHr(GR))
"pos" -> "SNP position or start of an indel.")(stringHr, int32Hr, locusHr(GenomeReference.GRCh37))
register("Interval", { (x: Locus, y: Locus) => Interval(x, y) },
"""
Construct a :ref:`interval` object. Intervals are **left inclusive, right exclusive**. This means that ``[chr1:1, chr1:3)`` contains ``chr1:1`` and ``chr1:2``.
Construct a :ref:`interval(gr)` object. Intervals are **left inclusive, right exclusive**. This means that ``[chr1:1, chr1:3)`` contains ``chr1:1`` and ``chr1:2``.
""",
"startLocus" -> "Start position of interval",
"endLocus" -> "End position of interval")(locusHr(GR), locusHr(GR), locusIntervalHr(GR))
Expand Down Expand Up @@ -1264,15 +1265,15 @@ object FunctionRegistry {
Returns an interval parsed in the same way as :py:meth:`~hail.representation.Interval.parse`
""",
"s" -> "The string to parse."
)(stringHr, locusIntervalHr(GR))
)(stringHr, locusIntervalHr(GenomeReference.GRCh37))

register("Interval", (chr: String, start: Int, end: Int) => Interval(Locus(chr, start), Locus(chr, end)),
"""
Constructs an interval from a given chromosome, start, and end.
""",
"chr" -> "Chromosome.",
"start" -> "Starting position.",
"end" -> "Ending position (exclusive).")(stringHr, int32Hr, int32Hr, locusIntervalHr(GR))
"end" -> "Ending position (exclusive).")(stringHr, int32Hr, int32Hr, locusIntervalHr(GenomeReference.GRCh37))

register("pcoin", { (p: Double) => math.random < p },
"""
Expand Down Expand Up @@ -1511,13 +1512,13 @@ object FunctionRegistry {
"""
Produce an array of called counts for each allele in the variant (including reference). For example, calling this function with a biallelic variant on hom-ref, het, and hom-var genotypes will produce ``[2, 0]``, ``[1, 1]``, and ``[0, 2]`` respectively.
""",
"v" -> ":ref:`variant`")(genotypeHr, variantHr(GR), arrayHr(int32Hr))
"v" -> ":ref:`variant(gr)`")(genotypeHr, variantHr(GR), arrayHr(int32Hr))

registerMethod("oneHotGenotype", (g: Genotype, v: Variant) => Genotype.oneHotGenotype(v, g).orNull,
"""
Produces an array with one element for each possible genotype in the variant, where the called genotype is 1 and all else 0. For example, calling this function with a biallelic variant on hom-ref, het, and hom-var genotypes will produce ``[1, 0, 0]``, ``[0, 1, 0]``, and ``[0, 0, 1]`` respectively.
""",
"v" -> ":ref:`variant`"
"v" -> ":ref:`variant(gr)`"
)(genotypeHr, variantHr(GR), arrayHr(int32Hr))

registerMethod("replace", (str: String, pattern1: String, pattern2: String) =>
Expand All @@ -1544,7 +1545,7 @@ object FunctionRegistry {
let i = Interval(Locus("1", 1000), Locus("1", 2000)) in i.contains(Locus("1", 1500))
result: true
""",
"locus" -> ":ref:`locus`")(locusIntervalHr(GR), locusHr(GR), boolHr)
"locus" -> ":ref:`locus(gr)`")(locusIntervalHr(GR), locusHr(GR), boolHr)

val sizeDocstring = "Number of elements in the collection."
registerMethod("length", (a: IndexedSeq[Any]) => a.length, sizeDocstring)(arrayHr(TTHr), int32Hr)
Expand Down
14 changes: 7 additions & 7 deletions src/main/scala/is/hail/expr/HailRep.scala
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package is.hail.expr

import is.hail.utils.Interval
import is.hail.variant.{AltAllele, Call, GRVariable, Genotype, Locus, Variant}
import is.hail.variant.{AltAllele, Call, GRBase, GRVariable, Genotype, Locus, Variant}

trait HailRep[T] { self =>
def typ: Type
Expand Down Expand Up @@ -61,20 +61,20 @@ trait HailRepFunctions {
def typ = TGenotype
}

implicit class variantHr(gr: GRVariable) extends HailRep[Variant] {
def typ = TVariant(gr.gr)
implicit class variantHr(gr: GRBase) extends HailRep[Variant] {
def typ = TVariant(gr)
}

implicit class locusHr(gr: GRVariable) extends HailRep[Locus] {
def typ = TLocus(gr.gr)
implicit class locusHr(gr: GRBase) extends HailRep[Locus] {
def typ = TLocus(gr)
}

implicit object altAlleleHr extends HailRep[AltAllele] {
def typ = TAltAllele
}

implicit class locusIntervalHr(gr: GRVariable) extends HailRep[Interval[Locus]] {
def typ = TInterval(gr.gr)
implicit class locusIntervalHr(gr: GRBase) extends HailRep[Interval[Locus]] {
def typ = TInterval(gr)
}

implicit def arrayHr[T](implicit hrt: HailRep[T]) = new HailRep[IndexedSeq[T]] {
Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/is/hail/expr/Parser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ object Parser extends JavaTokenParsers {

def type_expr: Parser[Type] =
"Empty" ^^ { _ => TStruct.empty } |
"Interval" ^^ { _ => TInterval(GenomeReference.GRCh37) } |
("Interval" ~ "(") ~> identifier <~ ")" ^^ { id => GenomeReference.getReference(id).interval } |
"Boolean" ^^ { _ => TBoolean } |
"Int32" ^^ { _ => TInt32 } |
"Int64" ^^ { _ => TInt64 } |
Expand All @@ -542,8 +542,8 @@ object Parser extends JavaTokenParsers {
"Float" ^^ { _ => TFloat64 } |
"String" ^^ { _ => TString } |
"AltAllele" ^^ { _ => TAltAllele } |
"Variant" ^^ { _ => TVariant(GenomeReference.GRCh37) } |
"Locus" ^^ { _ => TLocus(GenomeReference.GRCh37) } |
("Variant" ~ "(") ~> identifier <~ ")" ^^ { id => GenomeReference.getReference(id).variant } |
("Locus" ~ "(") ~> identifier <~ ")" ^^ { id => GenomeReference.getReference(id).locus } |
"Genotype" ^^ { _ => TGenotype } |
"Call" ^^ { _ => TCall } |
("Array" ~ "[") ~> type_expr <~ "]" ^^ { elementType => TArray(elementType) } |
Expand Down
Loading

0 comments on commit 653dbfd

Please sign in to comment.