Skip to content

Commit

Permalink
Reference Genome #4: Python interfaces (hail-is#2090)
Browse files Browse the repository at this point in the history
* Reference Genome #4: Python interfaces

- Added GenomeReference and Contig classes in Python
- Tests
- Documentation

* fixed test

* addressing comments

* changed interface

* addressed comments

* addressed comments

* changed name of python script

* fix compile error

* Revert "fix compile error"

This reverts commit eb98237.

* Revert "changed name of python script"

This reverts commit 0737b49.

* Revert "addressed comments"

This reverts commit 9601c23.

* Revert "addressed comments"

This reverts commit ee6527f.

* Addressed comments

* addressed comments
  • Loading branch information
jigold authored Sep 6, 2017
1 parent 10daedc commit 8170217
Show file tree
Hide file tree
Showing 9 changed files with 364 additions and 81 deletions.
2 changes: 1 addition & 1 deletion python/hail/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2793,7 +2793,7 @@ def ld_matrix(self, force_local=False):

jldm = self._jvdf.ldMatrix(force_local)
return LDMatrix(jldm)

@handle_py4j
@record_method
@typecheck_method(key_name=strlike,
Expand Down
9 changes: 5 additions & 4 deletions python/hail/docs/representation/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ representation
:toctree: ./
:template: class.rst

hail.representation.Variant
hail.representation.AltAllele
hail.representation.Genotype
hail.representation.Call
hail.representation.Locus
hail.representation.GenomeReference
hail.representation.Genotype
hail.representation.Interval
hail.representation.Trio
hail.representation.Locus
hail.representation.Pedigree
hail.representation.Struct
hail.representation.Trio
hail.representation.Variant
4 changes: 3 additions & 1 deletion python/hail/representation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from hail.representation.genotype import Genotype, Call
from hail.representation.annotations import Struct
from hail.representation.pedigree import Trio, Pedigree
from hail.representation.genomeref import GenomeReference

__all__ = ['Variant',
'Locus',
Expand All @@ -12,4 +13,5 @@
'Struct',
'Call',
'Pedigree',
'Trio']
'Trio',
'GenomeReference']
203 changes: 203 additions & 0 deletions python/hail/representation/genomeref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
from hail.java import handle_py4j, jiterable_to_list
from hail.typecheck import *
from hail.representation.interval import Interval
from hail.utils import wrap_to_list
from hail.history import *


class GenomeReference(HistoryMixin):
    """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__.

    :param str name: Name of reference.

    :param contigs: Contig names.
    :type contigs: list of str

    :param lengths: Dict of contig names to contig lengths.
    :type lengths: dict of str to int

    :param x_contigs: Contigs to be treated as X chromosomes.
    :type x_contigs: str or list of str

    :param y_contigs: Contigs to be treated as Y chromosomes.
    :type y_contigs: str or list of str

    :param mt_contigs: Contigs to be treated as mitochondrial DNA.
    :type mt_contigs: str or list of str

    :param par: List of intervals representing pseudoautosomal regions.
    :type par: list of :class:`.Interval`

    >>> contigs = ["1", "X", "Y", "MT"]
    >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
    >>> par = [Interval.parse("X:60001-2699521")]
    >>> my_ref = GenomeReference("my_ref", contigs, lengths, "X", "Y", "MT", par)
    """

    @handle_py4j
    @record_init
    @typecheck_method(name=strlike,
                      contigs=listof(strlike),
                      lengths=dictof(strlike, integral),
                      x_contigs=oneof(strlike, listof(strlike)),
                      y_contigs=oneof(strlike, listof(strlike)),
                      mt_contigs=oneof(strlike, listof(strlike)),
                      par=listof(Interval))
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        # Take private copies of every container argument. The copies are what
        # get stored on the instance and handed back by the properties, so a
        # caller mutating e.g. ``gr.x_contigs`` can neither corrupt the shared
        # ``[]`` default objects in the signature nor the containers the
        # caller passed in.
        contigs = list(wrap_to_list(contigs))
        lengths = dict(lengths)
        x_contigs = list(wrap_to_list(x_contigs))
        y_contigs = list(wrap_to_list(y_contigs))
        mt_contigs = list(wrap_to_list(mt_contigs))
        par = list(par)

        # The Java constructor takes the Java representation of each interval.
        par_jrep = [interval._jrep for interval in par]

        jrep = (Env.hail().variant.GenomeReference
                .apply(name,
                       contigs,
                       lengths,
                       x_contigs,
                       y_contigs,
                       mt_contigs,
                       par_jrep))

        self._init_from_java(jrep)
        self._name = name
        self._contigs = contigs
        self._lengths = lengths
        self._x_contigs = x_contigs
        self._y_contigs = y_contigs
        self._mt_contigs = mt_contigs
        self._par = par

        super(GenomeReference, self).__init__()

    @handle_py4j
    def __str__(self):
        return self._jrep.toString()

    def __repr__(self):
        return 'GenomeReference(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
               (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self.par)

    @handle_py4j
    def __eq__(self, other):
        # Objects of any other type are simply unequal; without this guard the
        # comparison raised AttributeError on the missing ``_jrep``.
        return isinstance(other, GenomeReference) and self._jrep.equals(other._jrep)

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; define it explicitly
        # so that the two operators always agree.
        return not self.__eq__(other)

    @handle_py4j
    def __hash__(self):
        return self._jrep.hashCode()

    @property
    def name(self):
        """Name of genome reference.

        :rtype: str
        """
        return self._name

    @property
    def contigs(self):
        """Contig names.

        :rtype: list of str
        """
        return self._contigs

    @property
    def lengths(self):
        """Dict of contig name to contig length.

        :rtype: dict of str to int
        """
        return self._lengths

    @property
    def x_contigs(self):
        """X contigs.

        :rtype: list of str
        """
        return self._x_contigs

    @property
    def y_contigs(self):
        """Y contigs.

        :rtype: list of str
        """
        return self._y_contigs

    @property
    def mt_contigs(self):
        """Mitochondrial contigs.

        :rtype: list of str
        """
        return self._mt_contigs

    @property
    def par(self):
        """Pseudoautosomal regions.

        :rtype: list of :class:`.Interval`
        """
        return self._par

    @typecheck_method(contig=strlike)
    def contig_length(self, contig):
        """Contig length.

        :param contig: Contig
        :type contig: str

        :return: Length of contig.
        :rtype: int

        :raises KeyError: If `contig` is not a contig of this reference genome.
        """
        if contig not in self._lengths:
            raise KeyError("Contig `{}' is not in reference genome.".format(contig))
        return self._lengths[contig]

    @classmethod
    @record_classmethod
    @handle_py4j
    def GRCh37(cls):
        """Reference genome for GRCh37.

        Data from `GATK resource bundle <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict>`__.

        >>> grch37 = GenomeReference.GRCh37()

        :rtype: :class:`.GenomeReference`
        """
        return GenomeReference._from_java(Env.hail().variant.GenomeReference.GRCh37())

    @classmethod
    @record_classmethod
    @handle_py4j
    def GRCh38(cls):
        """Reference genome for GRCh38.

        Data from `GATK resource bundle <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.dict>`__.

        >>> grch38 = GenomeReference.GRCh38()

        :rtype: :class:`.GenomeReference`
        """
        return GenomeReference._from_java(Env.hail().variant.GenomeReference.GRCh38())

    @handle_py4j
    def _init_from_java(self, jrep):
        # Keep a handle on the backing Java object; __str__, __eq__ and
        # __hash__ all delegate to it.
        self._jrep = jrep

    @classmethod
    def _from_java(cls, jrep):
        # Build an instance around an existing Java object without running
        # __init__ (which would construct a new Java object), then mirror the
        # Java-side fields into plain Python values.
        gr = GenomeReference.__new__(cls)
        gr._init_from_java(jrep)
        gr._name = jrep.name()
        gr._contigs = [str(x) for x in jrep.contigs()]
        gr._lengths = {str(x._1()): int(x._2()) for x in jiterable_to_list(jrep.lengths())}
        gr._x_contigs = [str(x) for x in jiterable_to_list(jrep.xContigs())]
        gr._y_contigs = [str(x) for x in jiterable_to_list(jrep.yContigs())]
        gr._mt_contigs = [str(x) for x in jiterable_to_list(jrep.mtContigs())]
        gr._par = [Interval._from_java(x) for x in jrep.par()]
        return gr
27 changes: 27 additions & 0 deletions python/hail/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,33 @@ def test_representation(self):
self.assertFalse(c_nocall.is_het_non_ref())
self.assertFalse(c_nocall.is_het_ref())

gr = GenomeReference.GRCh37()
self.assertEqual(gr.name, "GRCh37")
self.assertEqual(gr.contigs[0], "1")
self.assertListEqual(gr.x_contigs, ["X"])
self.assertListEqual(gr.y_contigs, ["Y"])
self.assertListEqual(gr.mt_contigs, ["MT"])
self.assertEqual(gr.par[0], Interval.parse("X:60001-2699521"))
self.assertEqual(gr.contig_length("1"), 249250621)

name = "test"
contigs = ["1", "X", "Y", "MT"]
lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
x_contigs = ["X"]
y_contigs = ["Y"]
mt_contigs = ["MT"]
par = [Interval(Locus("X", 5), Locus("X", 1000))]

gr2 = GenomeReference(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par)
self.assertEqual(gr2.name, name)
self.assertListEqual(gr2.contigs, contigs)
self.assertListEqual(gr2.x_contigs, x_contigs)
self.assertListEqual(gr2.y_contigs, y_contigs)
self.assertListEqual(gr2.mt_contigs, mt_contigs)
self.assertEqual(gr2.par, par)
self.assertEqual(gr2.contig_length("1"), 10000)
self.assertDictEqual(gr2.lengths, lengths)

def test_types(self):
self.assertEqual(TInt32(), TInt32())
self.assertEqual(TFloat64(), TFloat64())
Expand Down
9 changes: 6 additions & 3 deletions src/main/scala/is/hail/expr/AnnotationImpex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package is.hail.expr

import is.hail.annotations.Annotation
import is.hail.utils.{Interval, _}
import is.hail.variant.{AltAllele, Contig, GenomeReference, Genotype, Locus, Sample, Variant}
import is.hail.variant.{AltAllele, GenomeReference, Genotype, Locus, Variant}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.json4s._
Expand Down Expand Up @@ -215,10 +215,13 @@ case class JSONExtractInterval(start: Locus, end: Locus) {
def toInterval = Interval(start, end)
}

case class JSONExtractGenomeReference(name: String, contigs: Array[Contig], xContigs: Set[String],
// Plain (name, length) record for a single contig. JSONExtractGenomeReference holds an
// array of these and reads both fields when it builds a GenomeReference.
// NOTE(review): presumably the target shape for JSON extraction, going by the name; confirm.
case class JSONExtractContig(name: String, length: Int)

case class JSONExtractGenomeReference(name: String, contigs: Array[JSONExtractContig], xContigs: Set[String],
yContigs: Set[String], mtContigs: Set[String], par: Array[JSONExtractInterval]) {

def toGenomeReference: GenomeReference = GenomeReference(name, contigs, xContigs, yContigs, mtContigs, par.map(_.toInterval))
def toGenomeReference: GenomeReference = GenomeReference(name, contigs.map(_.name),
contigs.map(c => (c.name, c.length)).toMap, xContigs, yContigs, mtContigs, par.map(_.toInterval))
}

object JSONAnnotationImpex extends AnnotationImpex[Type, JValue] {
Expand Down
Loading

0 comments on commit 8170217

Please sign in to comment.