diff --git a/src/main/scala/is/hail/expr/Parser.scala b/src/main/scala/is/hail/expr/Parser.scala index 34b10df77fd..63c5d07c69f 100644 --- a/src/main/scala/is/hail/expr/Parser.scala +++ b/src/main/scala/is/hail/expr/Parser.scala @@ -141,7 +141,7 @@ object Parser extends JavaTokenParsers { (anyFailAllFail(names), types, () => { (types, f()).zipped.map { case (t, v) => - t.str(v) + TableAnnotationImpex.exportAnnotation(v, t) } }) } diff --git a/src/main/scala/is/hail/keytable/KeyTable.scala b/src/main/scala/is/hail/keytable/KeyTable.scala index 125fedc0188..65664c8a9f8 100644 --- a/src/main/scala/is/hail/keytable/KeyTable.scala +++ b/src/main/scala/is/hail/keytable/KeyTable.scala @@ -494,7 +494,7 @@ case class KeyTable(hc: HailContext, rdd: RDD[Row], sb.clear() localTypes.indices.foreachBetween { i => - sb.append(localTypes(i).str(r.get(i))) + sb.append(TableAnnotationImpex.exportAnnotation(r.get(i), localTypes(i))) }(sb += '\t') sb.result() diff --git a/src/main/scala/is/hail/variant/Variant.scala b/src/main/scala/is/hail/variant/Variant.scala index 96d3d3afeb3..022c8db45b3 100644 --- a/src/main/scala/is/hail/variant/Variant.scala +++ b/src/main/scala/is/hail/variant/Variant.scala @@ -29,8 +29,12 @@ object Contig { def gen: Gen[Contig] = for { name <- Gen.identifier - length <- Gen.posInt + length <- Gen.choose(1000000, 500000000) } yield Contig(name, length) + + def gen(gr: GenomeReference): Gen[Contig] = for { + contig <- Gen.oneOfSeq(gr.contigs) + } yield contig } case class Contig(name: String, length: Int) { @@ -97,8 +101,8 @@ case class AltAllele(ref: String, def isStar: Boolean = alt == "*" - def isSNP: Boolean = !isStar && ( (ref.length == 1 && alt.length == 1) || - (ref.length == alt.length && nMismatch == 1) ) + def isSNP: Boolean = !isStar && ((ref.length == 1 && alt.length == 1) || + (ref.length == alt.length && nMismatch == 1)) def isMNP: Boolean = ref.length > 1 && ref.length == alt.length && @@ -232,37 +236,43 @@ object Variant { object VariantSubgen { val random = VariantSubgen( - contigGen = Gen.identifier, - startGen = Gen.posInt, + contigGen = Contig.gen, nAllelesGen = Gen.frequency((5, Gen.const(2)), (1, Gen.choose(2, 10))), refGen = genDNAString, altGen = Gen.frequency((10, genDNAString), (1, Gen.const("*")))) - val plinkCompatible = random.copy( - contigGen = Gen.choose(1, 22).map(_.toString) - ) + val plinkCompatible = { + val contigGen = for { + name <- Gen.choose(1, 22).map(_.toString) + length <- Gen.choose(1000000, 500000000) + } yield Contig(name, length) + + random.copy(contigGen = contigGen) + } val biallelic = random.copy(nAllelesGen = Gen.const(2)) + + def fromGenomeRef(gr: GenomeReference): VariantSubgen = + random.copy(contigGen = Contig.gen(gr)) } case class VariantSubgen( - contigGen: Gen[String], - startGen: Gen[Int], + contigGen: Gen[Contig], nAllelesGen: Gen[Int], refGen: Gen[String], altGen: Gen[String]) { def gen: Gen[Variant] = for (contig <- contigGen; - start <- startGen; + start <- Gen.choose(1, contig.length); nAlleles <- nAllelesGen; ref <- refGen; altAlleles <- Gen.distinctBuildableOfN[Array, String]( nAlleles, altGen) .filter(!_.contains(ref))) yield - Variant(contig, start, ref, altAlleles.tail.map(alt => AltAllele(ref, alt))) + Variant(contig.name, start, ref, altAlleles.tail.map(alt => AltAllele(ref, alt))) } case class Variant(contig: String, @@ -301,13 +311,19 @@ case class Variant(contig: String, def isAutosomalOrPseudoAutosomal: Boolean = isAutosomal || inXPar || inYPar + def isAutosomalOrPseudoAutosomal(gr: GenomeReference): Boolean = isAutosomal(gr) || inXPar(gr) || inYPar(gr) + def isAutosomal = !(inX || inY || isMitochondrial) + def isAutosomal(gr: GenomeReference): Boolean = !(inX(gr) || inY(gr) || isMitochondrial(gr)) + def isMitochondrial = { val c = contig.toUpperCase c == "MT" || c == "M" || c == "26" } + def isMitochondrial(gr: GenomeReference): Boolean = gr.isMitochondrial(contig) + // PAR regions of sex chromosomes: https://en.wikipedia.org/wiki/Pseudoautosomal_region // Boundaries for build GRCh37: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/ def inXParPos: Boolean = (60001 <= start && start <= 2699520) || (154931044 <= start && start <= 155260560) @@ -317,16 +333,28 @@ case class Variant(contig: String, // FIXME: will replace with contig == "X" etc once bgen/plink support is merged and conversion is handled by import def inXPar: Boolean = inX && inXParPos + def inXPar(gr: GenomeReference): Boolean = gr.inXPar(locus) + def inYPar: Boolean = inY && inYParPos + def inYPar(gr: GenomeReference): Boolean = gr.inYPar(locus) + def inXNonPar: Boolean = inX && !inXParPos + def inXNonPar(gr: GenomeReference): Boolean = inX(gr) && !inXPar(gr) + def inYNonPar: Boolean = inY && !inYParPos + def inYNonPar(gr: GenomeReference): Boolean = inY(gr) && !inYPar(gr) + private def inX: Boolean = contig.toUpperCase == "X" || contig == "23" || contig == "25" + private def inX(gr: GenomeReference): Boolean = gr.inX(contig) + private def inY: Boolean = contig.toUpperCase == "Y" || contig == "24" + private def inY(gr: GenomeReference): Boolean = gr.inY(contig) + import CopyState._ def copyState(sex: Sex.Sex): CopyState = @@ -340,6 +368,17 @@ case class Variant(contig: String, else Auto + def copyState(sex: Sex.Sex, gr: GenomeReference): CopyState = + if (sex == Sex.Male) + if (inXNonPar(gr)) + HemiX + else if (inYNonPar(gr)) + HemiY + else + Auto + else + Auto + def compare(that: Variant): Int = { var c = Contig.compare(contig, that.contig) if (c != 0) diff --git a/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala b/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala index cbd6b191aef..a33158e1385 100644 --- a/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala +++ b/src/test/scala/is/hail/variant/GenomeReferenceSuite.scala @@ -47,4 +47,25 @@ class GenomeReferenceSuite extends SparkSuite { intercept[IllegalArgumentException](GenomeReference("test", Array(Contig("1", 5), Contig("2", 5), Contig("3", 5)), Set.empty[String], Set.empty[String], Set("MT"), Array(Interval(Locus("X", 1), Locus("X", 5))))) } + + @Test def testVariant() { + val gr = GenomeReference.GRCh37 + + val v1 = Variant("1", 50000, "A", "T") + val v2 = Variant("X", 2499520, "T", "G") + val v3 = Variant("Y", 50001, "G", "C") + val v4 = Variant("MT", 30, "T", "G") + val v5 = Variant("X", 50, "G", "A") + val v6 = Variant("Y", 5000, "C", "T") + + for (v <- Array(v1, v2, v3, v4, v5, v6)) { + assert(v.isAutosomal == v.isAutosomal(gr)) + assert(v.isAutosomalOrPseudoAutosomal == v.isAutosomalOrPseudoAutosomal(gr)) + assert(v.isMitochondrial == v.isMitochondrial(gr)) + assert(v.inXPar == v.inXPar(gr)) + assert(v.inYPar == v.inYPar(gr)) + assert(v.inXNonPar == v.inXNonPar(gr)) + assert(v.inYNonPar == v.inYNonPar(gr)) + } + } }