Skip to content

Commit

Permalink
Reference Genome #2 (#2080)
Browse files Browse the repository at this point in the history
* Reference Genome #2

- Added methods to `Variant` that take a `GenomeReference` object
- Reconfigured VariantSubgen to take a Gen[Contig]
- Added a Gen[Contig] from a GenomeReference object

* Fixed export bug and slightly changed the contig generator
  • Loading branch information
jigold authored Aug 9, 2017
1 parent cc1f7e9 commit 6d6b069
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 14 deletions.
2 changes: 1 addition & 1 deletion src/main/scala/is/hail/expr/Parser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ object Parser extends JavaTokenParsers {
(anyFailAllFail(names), types,
() => {
(types, f()).zipped.map { case (t, v) =>
t.str(v)
TableAnnotationImpex.exportAnnotation(v, t)
}
})
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/is/hail/keytable/KeyTable.scala
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ case class KeyTable(hc: HailContext, rdd: RDD[Row],
sb.clear()

localTypes.indices.foreachBetween { i =>
sb.append(localTypes(i).str(r.get(i)))
sb.append(TableAnnotationImpex.exportAnnotation(r.get(i), localTypes(i)))
}(sb += '\t')

sb.result()
Expand Down
63 changes: 51 additions & 12 deletions src/main/scala/is/hail/variant/Variant.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,12 @@ object Contig {

def gen: Gen[Contig] = for {
name <- Gen.identifier
length <- Gen.posInt
length <- Gen.choose(1000000, 500000000)
} yield Contig(name, length)

/** Generator that draws one of the contigs listed in `gr.contigs`. */
def gen(gr: GenomeReference): Gen[Contig] =
  Gen.oneOfSeq(gr.contigs)
}

case class Contig(name: String, length: Int) {
Expand Down Expand Up @@ -97,8 +101,8 @@ case class AltAllele(ref: String,

def isStar: Boolean = alt == "*"

def isSNP: Boolean = !isStar && ( (ref.length == 1 && alt.length == 1) ||
(ref.length == alt.length && nMismatch == 1) )
def isSNP: Boolean = !isStar && ((ref.length == 1 && alt.length == 1) ||
(ref.length == alt.length && nMismatch == 1))

def isMNP: Boolean = ref.length > 1 &&
ref.length == alt.length &&
Expand Down Expand Up @@ -232,37 +236,43 @@ object Variant {

object VariantSubgen {
val random = VariantSubgen(
contigGen = Gen.identifier,
startGen = Gen.posInt,
contigGen = Contig.gen,
nAllelesGen = Gen.frequency((5, Gen.const(2)), (1, Gen.choose(2, 10))),
refGen = genDNAString,
altGen = Gen.frequency((10, genDNAString),
(1, Gen.const("*"))))

val plinkCompatible = random.copy(
contigGen = Gen.choose(1, 22).map(_.toString)
)
val plinkCompatible = {
val contigGen = for {
name <- Gen.choose(1, 22).map(_.toString)
length <- Gen.choose(1000000, 500000000)
} yield Contig(name, length)

random.copy(contigGen = contigGen)
}

val biallelic = random.copy(nAllelesGen = Gen.const(2))

/** Copy of `random` whose contig generator is replaced by `Contig.gen(gr)`;
  * the remaining generators are carried over unchanged by `copy`.
  */
def fromGenomeRef(gr: GenomeReference): VariantSubgen =
random.copy(contigGen = Contig.gen(gr))
}

case class VariantSubgen(
contigGen: Gen[String],
startGen: Gen[Int],
contigGen: Gen[Contig],
nAllelesGen: Gen[Int],
refGen: Gen[String],
altGen: Gen[String]) {

def gen: Gen[Variant] =
for (contig <- contigGen;
start <- startGen;
start <- Gen.choose(1, contig.length);
nAlleles <- nAllelesGen;
ref <- refGen;
altAlleles <- Gen.distinctBuildableOfN[Array, String](
nAlleles,
altGen)
.filter(!_.contains(ref))) yield
Variant(contig, start, ref, altAlleles.tail.map(alt => AltAllele(ref, alt)))
Variant(contig.name, start, ref, altAlleles.tail.map(alt => AltAllele(ref, alt)))
}

case class Variant(contig: String,
Expand Down Expand Up @@ -301,13 +311,19 @@ case class Variant(contig: String,
def isAutosomalOrPseudoAutosomal: Boolean =
isAutosomal || inXPar || inYPar

/** Reference-aware overload: true when any of `isAutosomal(gr)`, `inXPar(gr)`
  * or `inYPar(gr)` holds (evaluated in that order, short-circuiting).
  */
def isAutosomalOrPseudoAutosomal(gr: GenomeReference): Boolean = isAutosomal(gr) || inXPar(gr) || inYPar(gr)

def isAutosomal = !(inX || inY || isMitochondrial)

/** Reference-aware overload: true when, according to `gr`, this variant is
  * not on X, not on Y and not mitochondrial.
  */
def isAutosomal(gr: GenomeReference): Boolean =
  !inX(gr) && !inY(gr) && !isMitochondrial(gr)

/** True when the upper-cased contig name is one of "MT", "M" or "26". */
def isMitochondrial = contig.toUpperCase match {
  case "MT" | "M" | "26" => true
  case _ => false
}

def isMitochondrial(gr: GenomeReference): Boolean = gr.isMitochondrial(contig)

// PAR regions of sex chromosomes: https://en.wikipedia.org/wiki/Pseudoautosomal_region
// Boundaries for build GRCh37: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/
def inXParPos: Boolean = (60001 <= start && start <= 2699520) || (154931044 <= start && start <= 155260560)
Expand All @@ -317,16 +333,28 @@ case class Variant(contig: String,
// FIXME: will replace with contig == "X" etc once bgen/plink support is merged and conversion is handled by import
def inXPar: Boolean = inX && inXParPos

def inXPar(gr: GenomeReference): Boolean = gr.inXPar(locus)

def inYPar: Boolean = inY && inYParPos

def inYPar(gr: GenomeReference): Boolean = gr.inYPar(locus)

def inXNonPar: Boolean = inX && !inXParPos

/** Reference-aware overload: true when `inX(gr)` holds and `inXPar(gr)` does not. */
def inXNonPar(gr: GenomeReference): Boolean = inX(gr) && !inXPar(gr)

def inYNonPar: Boolean = inY && !inYParPos

/** Reference-aware overload: true when `inY(gr)` holds and `inYPar(gr)` does not. */
def inYNonPar(gr: GenomeReference): Boolean = inY(gr) && !inYPar(gr)

private def inX: Boolean = contig.toUpperCase == "X" || contig == "23" || contig == "25"

private def inX(gr: GenomeReference): Boolean = gr.inX(contig)

private def inY: Boolean = contig.toUpperCase == "Y" || contig == "24"

private def inY(gr: GenomeReference): Boolean = gr.inY(contig)

import CopyState._

def copyState(sex: Sex.Sex): CopyState =
Expand All @@ -340,6 +368,17 @@ case class Variant(contig: String,
else
Auto

/** Copy state of this variant for a sample of the given sex, using `gr` to
  * classify the contig.
  *
  * Any sex other than `Sex.Male` yields `Auto`. For `Sex.Male` the result is
  * `HemiX` when `inXNonPar(gr)`, otherwise `HemiY` when `inYNonPar(gr)`,
  * otherwise `Auto`.
  */
def copyState(sex: Sex.Sex, gr: GenomeReference): CopyState =
  if (sex != Sex.Male) Auto
  else if (inXNonPar(gr)) HemiX
  else if (inYNonPar(gr)) HemiY
  else Auto

def compare(that: Variant): Int = {
var c = Contig.compare(contig, that.contig)
if (c != 0)
Expand Down
21 changes: 21 additions & 0 deletions src/test/scala/is/hail/variant/GenomeReferenceSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,25 @@ class GenomeReferenceSuite extends SparkSuite {
intercept[IllegalArgumentException](GenomeReference("test", Array(Contig("1", 5), Contig("2", 5), Contig("3", 5)),
Set.empty[String], Set.empty[String], Set("MT"), Array(Interval(Locus("X", 1), Locus("X", 5)))))
}

/** Checks that every `GenomeReference`-taking predicate on `Variant` agrees
  * with its legacy, hard-coded counterpart when the reference is GRCh37.
  */
@Test def testVariant() {
  val gr = GenomeReference.GRCh37

  val v1 = Variant("1", 50000, "A", "T")
  val v2 = Variant("X", 2499520, "T", "G")
  val v3 = Variant("Y", 50001, "G", "C")
  val v4 = Variant("MT", 30, "T", "G")
  val v5 = Variant("X", 50, "G", "A")
  val v6 = Variant("Y", 5000, "C", "T")

  for (v <- Array(v1, v2, v3, v4, v5, v6)) {
    assert(v.isAutosomal == v.isAutosomal(gr))
    assert(v.isAutosomalOrPseudoAutosomal == v.isAutosomalOrPseudoAutosomal(gr))
    assert(v.isMitochondrial == v.isMitochondrial(gr))
    assert(v.inXPar == v.inXPar(gr))
    assert(v.inYPar == v.inYPar(gr))
    assert(v.inXNonPar == v.inXNonPar(gr))
    assert(v.inYNonPar == v.inYNonPar(gr))
    // The copyState(sex, gr) overload must match the legacy overload as well.
    // NOTE(review): assumes legacy copyState(sex) branches on inXNonPar/inYNonPar
    // in the same way as the new overload - confirm.
    assert(v.copyState(Sex.Male) == v.copyState(Sex.Male, gr))
  }
}
}

0 comments on commit 6d6b069

Please sign in to comment.