diff --git a/python/hail/dataset.py b/python/hail/dataset.py index b0d06e9d461..1cc21ce5ca0 100644 --- a/python/hail/dataset.py +++ b/python/hail/dataset.py @@ -236,7 +236,7 @@ def annotate_genotypes_expr(self, expr): ``expr`` is in genotype context so the following symbols are in scope: - ``g``: genotype annotation - - ``v`` (*Variant*): :ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``va``: variant annotations - ``s`` (*Sample*): sample - ``sa``: sample annotations @@ -603,7 +603,7 @@ def annotate_variants_expr(self, expr): ``expr`` is in variant context so the following symbols are in scope: - - ``v`` (*Variant*): :ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``va``: variant annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v`` @@ -695,7 +695,7 @@ def annotate_variants_table(self, table, root=None, expr=None, vds_key=None, pro Each expression in the list ``vds_key`` has the following symbols in scope: - - ``v`` (*Variant*): :ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``va``: variant annotations **The** ``root`` **and** ``expr`` **arguments** @@ -1776,13 +1776,13 @@ def filter_alleles(self, expr, annotation='va = va', subset=True, keep=True, The following symbols are in scope for ``expr``: - - ``v`` (*Variant*): :ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``va``: variant annotations - ``aIndex`` (*Int*): the index of the allele being tested The following symbols are in scope for ``annotation``: - - ``v`` (*Variant*): :ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``va``: variant annotations - ``aIndices`` (*Array[Int]*): the array of old indices (such that ``aIndices[newIndex] = oldIndex`` and ``aIndices[0] = 0``) @@ -1835,7 +1835,7 @@ def filter_genotypes(self, expr, keep=True): ``expr`` is in genotype context so the following symbols are in scope: - ``s`` (*Sample*): sample - - ``v`` (*Variant*): 
:ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``sa``: sample annotations - ``va``: variant annotations - ``global``: global annotations @@ -2065,7 +2065,7 @@ def filter_variants_expr(self, expr, keep=True): The following symbols are in scope for ``expr``: - - ``v`` (*Variant*): :ref:`variant` + - ``v`` (*Variant(GR)*): :ref:`variant(gr)` - ``va``: variant annotations - ``global``: global annotations - ``gs`` (*Aggregable[Genotype]*): aggregable of :ref:`genotype` for variant ``v`` @@ -4217,13 +4217,13 @@ def query_variants(self, exprs): The namespace of the expressions includes: - ``global``: global annotations - - ``variants`` (*Aggregable[Variant]*): aggregable of :ref:`variant` + - ``variants`` (*Aggregable[Variant(GR)]*): aggregable of :ref:`variant(gr)` Map and filter expressions on this aggregable have the additional namespace: - ``global``: global annotations - - ``v``: :ref:`variant` + - ``v``: :ref:`variant(gr)` - ``va``: variant annotations **Performance Note** @@ -4277,7 +4277,7 @@ def query_genotypes_typed(self, exprs): - ``global``: global annotations - ``g``: :ref:`genotype` - - ``v``: :ref:`variant` + - ``v``: :ref:`variant(gr)` - ``va``: variant annotations - ``s``: sample - ``sa``: sample annotations diff --git a/python/hail/docs/overview.rst b/python/hail/docs/overview.rst index 6b098fdcef2..f19f672dbba 100644 --- a/python/hail/docs/overview.rst +++ b/python/hail/docs/overview.rst @@ -18,8 +18,8 @@ Variant Dataset (VDS) .. image:: misc/hail-vds-rep.png Hail represents a genetic data set as a matrix where the rows are keyed by -:ref:`variant` objects, the columns are keyed by samples, and each cell is a -:ref:`genotype` object. :ref:`variant` objects and :ref:`genotype` objects each +:ref:`variant(gr)` objects, the columns are keyed by samples, and each cell is a +:ref:`genotype` object. :ref:`variant(gr)` objects and :ref:`genotype` objects each have methods to access attributes such as chromosome name and genotype call.
Although this representation is similar to the VCF format, Hail uses a fast and storage-efficient internal representation called a Variant Dataset (**VDS**). @@ -64,7 +64,7 @@ The abbreviations for the VDS elements in expressions are as follows: * - Symbol - Description * - ``v`` - - :ref:`variant` + - :ref:`variant(gr)` * - ``s`` - sample * - ``va`` diff --git a/src/main/scala/is/hail/annotations/UnsafeRow.scala b/src/main/scala/is/hail/annotations/UnsafeRow.scala index 59371bcd18e..1a5b72e8101 100644 --- a/src/main/scala/is/hail/annotations/UnsafeRow.scala +++ b/src/main/scala/is/hail/annotations/UnsafeRow.scala @@ -6,7 +6,7 @@ import com.esotericsoftware.kryo.io.{Input, Output} import com.esotericsoftware.kryo.{Kryo, KryoSerializable} import is.hail.expr._ import is.hail.utils._ -import is.hail.variant.{AltAllele, GenericGenotype, GenomeReference, Locus, Variant} +import is.hail.variant.{AltAllele, GRBase, GenericGenotype, Locus, Variant} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row @@ -115,8 +115,8 @@ object UnsafeRow { def readString(region: MemoryBuffer, boff: Long): String = new String(readBinary(region, boff)) - def readLocus(region: MemoryBuffer, offset: Long): Locus = { - val ft = TLocus(GenomeReference.GRCh37).fundamentalType.asInstanceOf[TStruct] + def readLocus(region: MemoryBuffer, offset: Long, gr: GRBase): Locus = { + val ft = gr.locus.fundamentalType.asInstanceOf[TStruct] Locus( readString(region, ft.loadField(region, offset, 0)), region.loadInt(ft.loadField(region, offset, 1))) @@ -189,13 +189,13 @@ object UnsafeRow { region.loadInt(ft.loadField(region, offset, 1)), readString(region, ft.loadField(region, offset, 2)), readArrayAltAllele(region, ft.loadField(region, offset, 3))) - case x: TLocus => readLocus(region, offset) + case x: TLocus => readLocus(region, offset, x.gr) case TAltAllele => readAltAllele(region, offset) case x: TInterval => val ft = x.fundamentalType.asInstanceOf[TStruct] Interval[Locus]( - 
readLocus(region, ft.loadField(region, offset, 0)), - readLocus(region, ft.loadField(region, offset, 1))) + readLocus(region, ft.loadField(region, offset, 0), x.gr), + readLocus(region, ft.loadField(region, offset, 1), x.gr)) case TGenotype => val ft = TGenotype.fundamentalType.asInstanceOf[TStruct] val gt: Int = diff --git a/src/main/scala/is/hail/expr/FunctionRegistry.scala b/src/main/scala/is/hail/expr/FunctionRegistry.scala index 67270a2597e..8e1aab5fad1 100644 --- a/src/main/scala/is/hail/expr/FunctionRegistry.scala +++ b/src/main/scala/is/hail/expr/FunctionRegistry.scala @@ -622,7 +622,8 @@ object FunctionRegistry { def typ = TTBoxed } - val GR = GRVariable(GenomeReference.GRCh37) + val GR = GRVariable() + private def nonceToNullable[T : TypeInfo, U >: Null](check: Code[T] => Code[Boolean], v: Code[T], ifPresent: Code[T] => Code[U]): CM[Code[U]] = for ( (stx, x) <- CM.memoize(v) ) yield Code(stx, check(x).mux(Code._null[U], ifPresent(x))) @@ -653,7 +654,7 @@ object FunctionRegistry { """ Produce an array of called counts for each allele in the variant (including reference). For example, calling this function with a biallelic variant on hom-ref, het, and hom-var calls will produce ``[2, 0]``, ``[1, 1]``, and ``[0, 2]`` respectively. """, - "v" -> ":ref:`variant`")(callHr, variantHr(GR), arrayHr(int32Hr)) + "v" -> ":ref:`variant(gr)`")(callHr, variantHr(GR), arrayHr(int32Hr)) registerMethodSpecial("oneHotGenotype", { (c: () => Any, v: () => Any) => val call = c().asInstanceOf[Call] val variant = v().asInstanceOf[Variant] @@ -665,7 +666,7 @@ object FunctionRegistry { """ Produces an array with one element for each possible genotype in the variant, where the called genotype is 1 and all else 0. For example, calling this function with a biallelic variant on hom-ref, het, and hom-var calls will produce ``[1, 0, 0]``, ``[0, 1, 0]``, and ``[0, 0, 1]`` respectively. 
""", - "v" -> ":ref:`variant`")(callHr, variantHr(GR), arrayHr(int32Hr)) + "v" -> ":ref:`variant(gr)`")(callHr, variantHr(GR), arrayHr(int32Hr)) registerFieldCode("gt", { (x: Code[Genotype]) => nonceToNullable[Int, java.lang.Integer](_.ceq(-1), x.invoke[Int]("_unboxedGT"), boxInt(_)) @@ -797,8 +798,8 @@ object FunctionRegistry { registerMethod("isAutosomal", { (x: Variant) => x.isAutosomal }, "True if chromosome is not X, not Y, and not MT.")(variantHr(GR), boolHr) registerField("contig", { (x: Locus) => x.contig }, "String representation of contig.")(locusHr(GR), stringHr) registerField("position", { (x: Locus) => x.position }, "Chromosomal position.")(locusHr(GR), int32Hr) - registerField("start", { (x: Interval[Locus]) => x.start }, ":ref:`locus` at the start of the interval (inclusive).")(locusIntervalHr(GR), locusHr(GR)) - registerField("end", { (x: Interval[Locus]) => x.end }, ":ref:`locus` at the end of the interval (exclusive).")(locusIntervalHr(GR), locusHr(GR)) + registerField("start", { (x: Interval[Locus]) => x.start }, ":ref:`locus(gr)` at the start of the interval (inclusive).")(locusIntervalHr(GR), locusHr(GR)) + registerField("end", { (x: Interval[Locus]) => x.end }, ":ref:`locus(gr)` at the end of the interval (exclusive).")(locusIntervalHr(GR), locusHr(GR)) registerField("ref", { (x: AltAllele) => x.ref }, "Reference allele base sequence.") registerField("alt", { (x: AltAllele) => x.alt }, "Alternate allele base sequence.") registerMethod("isSNP", { (x: AltAllele) => x.isSNP }, "True if ``v.ref`` and ``v.alt`` are the same length and differ in one position.") @@ -973,7 +974,7 @@ object FunctionRegistry { register("Variant", { (x: String) => Variant.parse(x) }, """ - Construct a :ref:`variant` object. + Construct a :ref:`variant(gr)` object. .. 
code-block:: text :emphasize-lines: 2 @@ -981,10 +982,10 @@ object FunctionRegistry { let v = Variant("7:76324539:A:G") in v.contig result: "7" """, - "s" -> "String of the form ``CHR:POS:REF:ALT`` or ``CHR:POS:REF:ALT1,ALT2...ALTN`` specifying the contig, position, reference and alternate alleles.")(stringHr, variantHr(GR)) + "s" -> "String of the form ``CHR:POS:REF:ALT`` or ``CHR:POS:REF:ALT1,ALT2...ALTN`` specifying the contig, position, reference and alternate alleles.")(stringHr, variantHr(GenomeReference.GRCh37)) register("Variant", { (x: String, y: Int, z: String, a: String) => Variant(x, y, z, a) }, """ - Construct a :ref:`variant` object. + Construct a :ref:`variant(gr)` object. .. code-block:: text :emphasize-lines: 2 @@ -995,10 +996,10 @@ object FunctionRegistry { "contig" -> "String representation of contig.", "pos" -> "SNP position or start of an indel.", "ref" -> "Reference allele sequence.", - "alt" -> "Alternate allele sequence.")(stringHr, int32Hr, stringHr, stringHr, variantHr(GR)) + "alt" -> "Alternate allele sequence.")(stringHr, int32Hr, stringHr, stringHr, variantHr(GenomeReference.GRCh37)) register("Variant", { (x: String, y: Int, z: String, a: IndexedSeq[String]) => Variant(x, y, z, a.toArray) }, """ - Construct a :ref:`variant` object. + Construct a :ref:`variant(gr)` object. .. code-block:: text :emphasize-lines: 2 @@ -1010,7 +1011,7 @@ object FunctionRegistry { "pos" -> "SNP position or start of an indel.", "ref" -> "Reference allele sequence.", "alts" -> "Array of alternate allele sequences." 
- )(stringHr, int32Hr, stringHr, arrayHr(stringHr), variantHr(GR)) + )(stringHr, int32Hr, stringHr, arrayHr(stringHr), variantHr(GenomeReference.GRCh37)) register("Dict", { (keys: IndexedSeq[Annotation], values: IndexedSeq[Annotation]) => if (keys.length != values.length) @@ -1020,7 +1021,7 @@ object FunctionRegistry { "keys" -> "Keys of Dict.", "values" -> "Values of Dict.")(arrayHr(TTHr), arrayHr(TUHr), dictHr(TTHr, TUHr)) - val combineVariantsStruct = TStruct(Array(("variant", TVariant(GenomeReference.GRCh37), "Resulting combined variant."), + val combineVariantsStruct = TStruct(Array(("variant", TVariant(GR), "Resulting combined variant."), ("laIndices", TDict(TInt32, TInt32), "Mapping from new to old allele index for the left variant."), ("raIndices", TDict(TInt32, TInt32), "Mapping from new to old allele index for the right variant.") ).zipWithIndex.map { case ((n, t, d), i) => Field(n, t, i, Map(("desc", d))) }) @@ -1076,7 +1077,7 @@ object FunctionRegistry { Locus(chr, pos.toInt) }, """ - Construct a :ref:`locus` object. + Construct a :ref:`locus(gr)` object. .. code-block:: text :emphasize-lines: 2 @@ -1085,11 +1086,11 @@ object FunctionRegistry { result: 10040532 """, ("s", "String of the form ``CHR:POS``") - )(stringHr, locusHr(GR)) + )(stringHr, locusHr(GenomeReference.GRCh37)) register("Locus", { (x: String, y: Int) => Locus(x, y) }, """ - Construct a :ref:`locus` object. + Construct a :ref:`locus(gr)` object. .. code-block:: text :emphasize-lines: 2 @@ -1098,10 +1099,10 @@ object FunctionRegistry { result: 10040532 """, "contig" -> "String representation of contig.", - "pos" -> "SNP position or start of an indel.")(stringHr, int32Hr, locusHr(GR)) + "pos" -> "SNP position or start of an indel.")(stringHr, int32Hr, locusHr(GenomeReference.GRCh37)) register("Interval", { (x: Locus, y: Locus) => Interval(x, y) }, """ - Construct a :ref:`interval` object. Intervals are **left inclusive, right exclusive**. 
This means that ``[chr1:1, chr1:3)`` contains ``chr1:1`` and ``chr1:2``. + Construct a :ref:`interval(gr)` object. Intervals are **left inclusive, right exclusive**. This means that ``[chr1:1, chr1:3)`` contains ``chr1:1`` and ``chr1:2``. """, "startLocus" -> "Start position of interval", "endLocus" -> "End position of interval")(locusHr(GR), locusHr(GR), locusIntervalHr(GR)) @@ -1264,7 +1265,7 @@ object FunctionRegistry { Returns an interval parsed in the same way as :py:meth:`~hail.representation.Interval.parse` """, "s" -> "The string to parse." - )(stringHr, locusIntervalHr(GR)) + )(stringHr, locusIntervalHr(GenomeReference.GRCh37)) register("Interval", (chr: String, start: Int, end: Int) => Interval(Locus(chr, start), Locus(chr, end)), """ @@ -1272,7 +1273,7 @@ object FunctionRegistry { """, "chr" -> "Chromosome.", "start" -> "Starting position.", - "end" -> "Ending position (exclusive).")(stringHr, int32Hr, int32Hr, locusIntervalHr(GR)) + "end" -> "Ending position (exclusive).")(stringHr, int32Hr, int32Hr, locusIntervalHr(GenomeReference.GRCh37)) register("pcoin", { (p: Double) => math.random < p }, """ @@ -1511,13 +1512,13 @@ object FunctionRegistry { """ Produce an array of called counts for each allele in the variant (including reference). For example, calling this function with a biallelic variant on hom-ref, het, and hom-var genotypes will produce ``[2, 0]``, ``[1, 1]``, and ``[0, 2]`` respectively. """, - "v" -> ":ref:`variant`")(genotypeHr, variantHr(GR), arrayHr(int32Hr)) + "v" -> ":ref:`variant(gr)`")(genotypeHr, variantHr(GR), arrayHr(int32Hr)) registerMethod("oneHotGenotype", (g: Genotype, v: Variant) => Genotype.oneHotGenotype(v, g).orNull, """ Produces an array with one element for each possible genotype in the variant, where the called genotype is 1 and all else 0. For example, calling this function with a biallelic variant on hom-ref, het, and hom-var genotypes will produce ``[1, 0, 0]``, ``[0, 1, 0]``, and ``[0, 0, 1]`` respectively. 
""", - "v" -> ":ref:`variant`" + "v" -> ":ref:`variant(gr)`" )(genotypeHr, variantHr(GR), arrayHr(int32Hr)) registerMethod("replace", (str: String, pattern1: String, pattern2: String) => @@ -1544,7 +1545,7 @@ object FunctionRegistry { let i = Interval(Locus("1", 1000), Locus("1", 2000)) in i.contains(Locus("1", 1500)) result: true """, - "locus" -> ":ref:`locus`")(locusIntervalHr(GR), locusHr(GR), boolHr) + "locus" -> ":ref:`locus(gr)`")(locusIntervalHr(GR), locusHr(GR), boolHr) val sizeDocstring = "Number of elements in the collection." registerMethod("length", (a: IndexedSeq[Any]) => a.length, sizeDocstring)(arrayHr(TTHr), int32Hr) diff --git a/src/main/scala/is/hail/expr/HailRep.scala b/src/main/scala/is/hail/expr/HailRep.scala index 7c8c482a34d..9a7448c8938 100644 --- a/src/main/scala/is/hail/expr/HailRep.scala +++ b/src/main/scala/is/hail/expr/HailRep.scala @@ -1,7 +1,7 @@ package is.hail.expr import is.hail.utils.Interval -import is.hail.variant.{AltAllele, Call, GRVariable, Genotype, Locus, Variant} +import is.hail.variant.{AltAllele, Call, GRBase, GRVariable, Genotype, Locus, Variant} trait HailRep[T] { self => def typ: Type @@ -61,20 +61,20 @@ trait HailRepFunctions { def typ = TGenotype } - implicit class variantHr(gr: GRVariable) extends HailRep[Variant] { - def typ = TVariant(gr.gr) + implicit class variantHr(gr: GRBase) extends HailRep[Variant] { + def typ = TVariant(gr) } - implicit class locusHr(gr: GRVariable) extends HailRep[Locus] { - def typ = TLocus(gr.gr) + implicit class locusHr(gr: GRBase) extends HailRep[Locus] { + def typ = TLocus(gr) } implicit object altAlleleHr extends HailRep[AltAllele] { def typ = TAltAllele } - implicit class locusIntervalHr(gr: GRVariable) extends HailRep[Interval[Locus]] { - def typ = TInterval(gr.gr) + implicit class locusIntervalHr(gr: GRBase) extends HailRep[Interval[Locus]] { + def typ = TInterval(gr) } implicit def arrayHr[T](implicit hrt: HailRep[T]) = new HailRep[IndexedSeq[T]] { diff --git 
a/src/main/scala/is/hail/expr/Parser.scala b/src/main/scala/is/hail/expr/Parser.scala index c29cfb05109..b72ed1f4499 100644 --- a/src/main/scala/is/hail/expr/Parser.scala +++ b/src/main/scala/is/hail/expr/Parser.scala @@ -532,7 +532,7 @@ object Parser extends JavaTokenParsers { def type_expr: Parser[Type] = "Empty" ^^ { _ => TStruct.empty } | - "Interval" ^^ { _ => TInterval(GenomeReference.GRCh37) } | + ("Interval" ~ "(") ~> identifier <~ ")" ^^ { id => GenomeReference.getReference(id).interval } | "Boolean" ^^ { _ => TBoolean } | "Int32" ^^ { _ => TInt32 } | "Int64" ^^ { _ => TInt64 } | @@ -542,8 +542,8 @@ object Parser extends JavaTokenParsers { "Float" ^^ { _ => TFloat64 } | "String" ^^ { _ => TString } | "AltAllele" ^^ { _ => TAltAllele } | - "Variant" ^^ { _ => TVariant(GenomeReference.GRCh37) } | - "Locus" ^^ { _ => TLocus(GenomeReference.GRCh37) } | + ("Variant" ~ "(") ~> identifier <~ ")" ^^ { id => GenomeReference.getReference(id).variant } | + ("Locus" ~ "(") ~> identifier <~ ")" ^^ { id => GenomeReference.getReference(id).locus } | "Genotype" ^^ { _ => TGenotype } | "Call" ^^ { _ => TCall } | ("Array" ~ "[") ~> type_expr <~ "]" ^^ { elementType => TArray(elementType) } | diff --git a/src/main/scala/is/hail/expr/Type.scala b/src/main/scala/is/hail/expr/Type.scala index 82cf18506b2..97076dccc27 100644 --- a/src/main/scala/is/hail/expr/Type.scala +++ b/src/main/scala/is/hail/expr/Type.scala @@ -6,7 +6,7 @@ import is.hail.check.{Gen, _} import is.hail.sparkextras.OrderedKey import is.hail.utils import is.hail.utils.{Interval, StringEscapeUtils, _} -import is.hail.variant.{AltAllele, Call, Contig, GenomeReference, Genotype, Locus, Variant} +import is.hail.variant.{AltAllele, Call, Contig, GRBase, GenomeReference, Genotype, Locus, Variant} import org.apache.spark.sql.Row import org.apache.spark.sql.types.DataType import org.json4s._ @@ -982,8 +982,8 @@ case object TAltAllele extends ComplexType { "alt" -> TString) } -case class TVariant(gr: GenomeReference) 
extends ComplexType { - override def toString = "Variant" +case class TVariant(gr: GRBase) extends ComplexType { + override def toString = s"""Variant($gr)""" def typeCheck(a: Any): Boolean = a == null || a.isInstanceOf[Variant] @@ -991,7 +991,7 @@ case class TVariant(gr: GenomeReference) extends ComplexType { override def desc: String = """ - A ``Variant`` is a Hail data type representing a variant in the Variant Dataset. It is referred to as ``v`` in the expression language. + A ``Variant(GR)`` is a Hail data type representing a variant in the dataset. It is parameterized by a genome reference (GR) such as GRCh37 or GRCh38. It is referred to as ``v`` in the expression language. The `pseudoautosomal region `_ (PAR) is currently defined with respect to reference `GRCh37 `_: @@ -1063,16 +1063,20 @@ case class TVariant(gr: GenomeReference) extends ComplexType { case TVariant(cgr) => gr.unify(cgr) case _ => false } + + override def clear(): Unit = gr.clear() + + override def subst() = gr.subst().variant } -case class TLocus(gr: GenomeReference) extends ComplexType { - override def toString = "Locus" +case class TLocus(gr: GRBase) extends ComplexType { + override def toString = s"Locus($gr)" def typeCheck(a: Any): Boolean = a == null || a.isInstanceOf[Locus] override def genNonmissingValue: Gen[Annotation] = Locus.gen - override def desc: String = "A ``Locus`` is a Hail data type representing a specific genomic location in the Variant Dataset." + override def desc: String = "A ``Locus(GR)`` is a Hail data type representing a specific genomic location in the Variant Dataset. It is parameterized by a genome reference (GR) such as GRCh37 or GRCh38." 
override def scalaClassTag: ClassTag[Locus] = classTag[Locus] @@ -1111,16 +1115,20 @@ case class TLocus(gr: GenomeReference) extends ComplexType { case TLocus(cgr) => gr.unify(cgr) case _ => false } + + override def clear(): Unit = gr.clear() + + override def subst() = gr.subst().locus } -case class TInterval(gr: GenomeReference) extends ComplexType { - override def toString = "Interval" +case class TInterval(gr: GRBase) extends ComplexType { + override def toString = s"""Interval($gr)""" def typeCheck(a: Any): Boolean = a == null || a.isInstanceOf[Interval[_]] && a.asInstanceOf[Interval[_]].end.isInstanceOf[Locus] override def genNonmissingValue: Gen[Annotation] = Interval.gen(Locus.gen) - override def desc: String = "An ``Interval`` is a Hail data type representing a range of genomic locations in the Variant Dataset." + override def desc: String = "An ``Interval(GR)`` is a Hail data type representing a range of genomic locations in the dataset. It is parameterized by a genome reference (GR) such as GRCh37 or GRCh38." 
override def scalaClassTag: ClassTag[Interval[Locus]] = classTag[Interval[Locus]] @@ -1156,6 +1164,10 @@ case class TInterval(gr: GenomeReference) extends ComplexType { case TInterval(cgr) => gr.unify(cgr) case _ => false } + + override def clear(): Unit = gr.clear() + + override def subst() = gr.subst().interval } final case class Field(name: String, typ: Type, diff --git a/src/main/scala/is/hail/methods/MendelErrors.scala b/src/main/scala/is/hail/methods/MendelErrors.scala index 6d05fe0e041..918ce6bf77a 100644 --- a/src/main/scala/is/hail/methods/MendelErrors.scala +++ b/src/main/scala/is/hail/methods/MendelErrors.scala @@ -121,7 +121,8 @@ object MendelErrors { case class MendelErrors(hc: HailContext, trios: IndexedSeq[CompleteTrio], sampleIds: IndexedSeq[String], - mendelErrors: RDD[MendelError]) { + mendelErrors: RDD[MendelError], + gr: GRBase = GenomeReference.GRCh37) { private val sc = mendelErrors.sparkContext private val trioFam = trios.iterator.flatMap(t => t.fam.map(f => (t.kid, f))).toMap diff --git a/src/main/scala/is/hail/utils/FunctionDocumentation.scala b/src/main/scala/is/hail/utils/FunctionDocumentation.scala index 31933bc4136..2d8ca0231d0 100644 --- a/src/main/scala/is/hail/utils/FunctionDocumentation.scala +++ b/src/main/scala/is/hail/utils/FunctionDocumentation.scala @@ -49,7 +49,13 @@ object DocumentationEntry { val isMethod = tt.isInstanceOf[MethodType] val isField = tt.isInstanceOf[FieldType] - val argTypes = (if (isMethod || isField) tt.xs.tail else tt.xs).map { t => t.toString.replaceAll("\\?", "").replaceAll("\\(", "").replaceAll("\\)", "") }.toArray + val argTypes = (if (isMethod || isField) tt.xs.tail else tt.xs).map { t => + t match { + case TVariant(_) | TLocus(_) | TInterval(_) => t.toString.replaceAll("\\?", "") + case _ => t.toString.replaceAll("\\?", "").replaceAll("\\(", "").replaceAll("\\)", "") + } + }.toArray + val nArgs = argTypes.length val argNames = if (md.args.nonEmpty) md.args.map(_._1).toArray else ('a' to 
'z').take(nArgs).map(_.toString).toArray val argDescs = if (md.args.nonEmpty) md.args.map(_._2).toArray else Array.fill[String](nArgs)(null) @@ -89,11 +95,11 @@ case class DocumentationEntry(name: String, category: String, objType: Option[Ty val objCategory = { objType match { case Some(ot) => ot match { - case x: TAggregable => Some("Aggregable") - case x: TAggregableVariable => Some("Aggregable") - case x: TArray => Some("Array") - case x: TSet => Some("Set") - case x: TDict => Some("Dict") + case TAggregable(_) => Some("Aggregable") + case TAggregableVariable(_, _) => Some("Aggregable") + case TArray(_) => Some("Array") + case TSet(_) => Some("Set") + case TDict(_, _) => Some("Dict") case _ => Some(ot.toString.replaceAll("\\?", "")) } case None => None @@ -173,7 +179,7 @@ case class DocumentationEntry(name: String, category: String, objType: Option[Ty retType match { case rt: TStruct => if (rt.fields.nonEmpty) { - val fields = rt.fields.flatMap(fd => emitField(sb, fd, None)).map(s => "\t" + s) + val fields = rt.fields.flatMap(fd => emitField(sb, fd, None)).map(s => "\t" + s.replaceAll("\\?", "")) val output = (Array(".. 
container:: annotation\n") ++ fields).map(s => "\t" + s).mkString("\n") sb.append(output) diff --git a/src/main/scala/is/hail/variant/GenomeReference.scala b/src/main/scala/is/hail/variant/GenomeReference.scala index 935f4711d18..dc81175b7aa 100644 --- a/src/main/scala/is/hail/variant/GenomeReference.scala +++ b/src/main/scala/is/hail/variant/GenomeReference.scala @@ -3,7 +3,7 @@ package is.hail.variant import java.io.InputStream import is.hail.check.Gen -import is.hail.expr.JSONExtractGenomeReference +import is.hail.expr.{JSONExtractGenomeReference, TInterval, TLocus, TVariant} import is.hail.utils._ import org.json4s._ import org.json4s.jackson.JsonMethods @@ -12,6 +12,10 @@ import scala.collection.JavaConverters._ import scala.language.implicitConversions abstract class GRBase extends Serializable { + val variant: TVariant = TVariant(this) + val locus: TLocus = TLocus(this) + val interval: TInterval = TInterval(this) + def isValidContig(contig: String): Boolean def contigLength(contig: String): Int @@ -36,13 +40,13 @@ abstract class GRBase extends Serializable { def toJSON: JValue - def unify(concrete: GenomeReference): Boolean + def unify(concrete: GRBase): Boolean def isBound: Boolean def clear(): Unit - def subst(): GenomeReference + def subst(): GRBase } case class GenomeReference(name: String, contigs: Array[String], lengths: Map[String, Int], xContigs: Set[String], @@ -157,20 +161,34 @@ case class GenomeReference(name: String, contigs: Array[String], lengths: Map[St } } - def unify(concrete: GenomeReference): Boolean = this == concrete + def unify(concrete: GRBase): Boolean = this eq concrete def isBound: Boolean = true def clear() {} def subst(): GenomeReference = this + + override def toString: String = name } object GenomeReference { + val GRCh37: GenomeReference = fromResource("reference/grch37.json") + val GRCh38: GenomeReference = fromResource("reference/grch38.json") + var references: Map[String, GenomeReference] = Map("GRCh37" -> GRCh37, "GRCh38" 
-> GRCh38) + + def addReference(gr: GenomeReference) { + if (references.contains(gr.name)) + fatal(s"Cannot add reference genome. Reference genome `${ gr.name }' already exists.") + references += (gr.name -> gr) + } - def GRCh37: GenomeReference = fromResource("reference/grch37.json") - - def GRCh38: GenomeReference = fromResource("reference/grch38.json") + def getReference(name: String): GenomeReference = { + references.get(name) match { + case Some(gr) => gr + case None => fatal(s"No genome reference with name `$name' exists. Available references: `${ references.keys.mkString(", ") }'.") + } + } def fromJSON(json: JValue): GenomeReference = json.extract[JSONExtractGenomeReference].toGenomeReference @@ -197,16 +215,16 @@ object GenomeReference { par.asScala.toArray) } -case class GRVariable(var gr: GenomeReference = null) extends GRBase { +case class GRVariable(var gr: GRBase = null) extends GRBase { - override def toString = "GenomeReference" + override def toString = "?GR" - def unify(concrete: GenomeReference): Boolean = { + def unify(concrete: GRBase): Boolean = { if (gr == null) { gr = concrete true } else - gr == concrete + gr eq concrete } def isBound: Boolean = gr != null @@ -215,7 +233,7 @@ case class GRVariable(var gr: GenomeReference = null) extends GRBase { gr = null } - def subst(): GenomeReference = { + def subst(): GRBase = { assert(gr != null) gr } diff --git a/src/test/scala/is/hail/methods/AggregatorSuite.scala b/src/test/scala/is/hail/methods/AggregatorSuite.scala index 00401c18bb6..23752b3eb0f 100644 --- a/src/test/scala/is/hail/methods/AggregatorSuite.scala +++ b/src/test/scala/is/hail/methods/AggregatorSuite.scala @@ -246,13 +246,13 @@ class AggregatorSuite extends SparkSuite { vds.annotateVariantsExpr("va = gs.map(g => 5)")) TestUtils.interceptFatal("unrealizable type.*Aggregable\\[Genotype\\]")( vds.annotateVariantsExpr("va = gs.filter(g => true)")) - TestUtils.interceptFatal("unrealizable type.*Aggregable\\[Variant\\]")( + 
TestUtils.interceptFatal("unrealizable type.*Aggregable\\[Variant\\(GRCh37\\)\\]")( vds.queryVariants("variants") ) TestUtils.interceptFatal("unrealizable type.*Aggregable\\[String\\]")( vds.queryVariants("variants.map(v => v.contig)") ) - TestUtils.interceptFatal("unrealizable type.*Aggregable\\[Variant\\]")( + TestUtils.interceptFatal("unrealizable type.*Aggregable\\[Variant\\(GRCh37\\)\\]")( vds.queryVariants("variants.filter(v => false)") ) TestUtils.interceptFatal("unrealizable type.*Aggregable\\[String\\]")( diff --git a/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala b/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala index bf7358725d6..1e9c65f3e9d 100644 --- a/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala +++ b/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala @@ -1,7 +1,10 @@ package is.hail.variant -import is.hail.{SparkSuite, TestUtils} +import is.hail.expr.{TInterval, TLocus, TStruct, TVariant} +import is.hail.keytable.KeyTable import is.hail.utils.Interval +import is.hail.{SparkSuite, TestUtils} +import org.apache.spark.sql.Row import org.testng.annotations.Test class GenomeReferenceSuite extends SparkSuite { @@ -74,4 +77,27 @@ class GenomeReferenceSuite extends SparkSuite { assert(v.inYNonPar == v.inYNonPar(gr)) } } + + @Test def testParser() { + val gr = GenomeReference("foo", Array("1", "2", "3"), Map("1" -> 5, "2" -> 5, "3" -> 5), + Set.empty[String], Set.empty[String], Set.empty[String], Array.empty[Interval[Locus]]) + GenomeReference.addReference(gr) + + val vds = hc.importVCF("src/test/resources/sample.vcf") + .annotateVariantsExpr("va.v = NA: Variant(foo), va.l = NA: Locus(foo), va.i = NA: Interval(foo)") + + val vas = vds.vaSignature.asInstanceOf[TStruct] + + assert(vas.field("v").typ == TVariant(gr)) + assert(vas.field("l").typ == TLocus(gr)) + assert(vas.field("i").typ == TInterval(gr)) + } + + @Test(enabled = false) def testFuncReg() { + val data = Array(Row(Variant("X", 154931044, "A", "G"), 
Variant("X", 156030895, "A", "G"))) + val kt = KeyTable(hc, sc.parallelize(data), + TStruct(("v37", TVariant(GenomeReference.GRCh37)), ("v38", TVariant(GenomeReference.GRCh38)))) + + assert(kt.forall("v37.inXPar() && v38.inXPar()")) + } }