Skip to content

Commit

Permalink
addressed comments
Browse files: browse the repository at this point in the history
  • Loading branch information
jigold committed Aug 29, 2017
1 parent b41e8f2 commit ee6527f
Show file tree
Hide file tree
Showing 32 changed files with 280 additions and 270 deletions.
2 changes: 1 addition & 1 deletion python/hail/docs/representation/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ representation

hail.representation.AltAllele
hail.representation.Call
hail.representation.GenomeReference
hail.representation.ReferenceGenome
hail.representation.Genotype
hail.representation.Interval
hail.representation.Locus
Expand Down
6 changes: 3 additions & 3 deletions python/hail/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ class TVariant(Type):
"""

def __init__(self):
jtype = scala_object(Env.hail().expr, 'TVariant').apply(Env.hail().variant.GenomeReference.GRCh37())
jtype = scala_object(Env.hail().expr, 'TVariant').apply(Env.hail().variant.ReferenceGenome.GRCh37())
super(TVariant, self).__init__(jtype)

@classmethod
Expand Down Expand Up @@ -721,7 +721,7 @@ class TLocus(Type):
"""

def __init__(self):
jtype = scala_object(Env.hail().expr, 'TLocus').apply(Env.hail().variant.GenomeReference.GRCh37())
jtype = scala_object(Env.hail().expr, 'TLocus').apply(Env.hail().variant.ReferenceGenome.GRCh37())
super(TLocus, self).__init__(jtype)

@classmethod
Expand Down Expand Up @@ -763,7 +763,7 @@ class TInterval(Type):
"""

def __init__(self):
jtype = scala_object(Env.hail().expr, 'TInterval').apply(Env.hail().variant.GenomeReference.GRCh37())
jtype = scala_object(Env.hail().expr, 'TInterval').apply(Env.hail().variant.ReferenceGenome.GRCh37())
super(TInterval, self).__init__(jtype)

@classmethod
Expand Down
4 changes: 2 additions & 2 deletions python/hail/representation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from hail.representation.genotype import Genotype, Call
from hail.representation.annotations import Struct
from hail.representation.pedigree import Trio, Pedigree
from hail.representation.genomeref import GenomeReference
from hail.representation.genomeref import ReferenceGenome

__all__ = ['Variant',
'Locus',
Expand All @@ -14,4 +14,4 @@
'Call',
'Pedigree',
'Trio',
'GenomeReference']
'ReferenceGenome']
85 changes: 43 additions & 42 deletions python/hail/representation/genomeref.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,33 @@
from hail.history import *


class GenomeReference(HistoryMixin):
"""An object that represents a genome reference.
class ReferenceGenome(HistoryMixin):
"""An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`_.
:param str name: Name of reference
:param str name: Name of reference.
:param contigs: Contig names
:param contigs: Contig names.
:type contigs: list of str
:param lengths: Dict of contig names to contig lengths
:param lengths: Dict of contig names to contig lengths.
:type lengths: dict of str to int
:param x_contigs: Contigs to be treated as X chromosomes
:param x_contigs: Contigs to be treated as X chromosomes.
:type x_contigs: str or list of str
:param y_contigs: Contigs to be treated as Y chromosomes
:param y_contigs: Contigs to be treated as Y chromosomes.
:type y_contigs: str or list of str
:param mt_contigs: Contigs to be treated as mitochondrial chromosomes
:param mt_contigs: Contigs to be treated as mitochondrial DNA.
:type mt_contigs: str or list of str
:param par: List of intervals representing pseudoautosomal regions
:param par: List of intervals representing pseudoautosomal regions.
:type par: list of :class:`.Interval`
>>> contigs = ["1", "X", "Y", "MT"]
>>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
>>> par = [Interval.parse("X:60001-2699521")]
>>> my_gr = GenomeReference("my_gr", contigs, lengths, "X", "Y", "MT", par)
>>> my_ref = ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par)
"""

@handle_py4j
Expand All @@ -50,7 +50,7 @@ def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contig
mt_contigs = wrap_to_list(mt_contigs)
par_jrep = [interval._jrep for interval in par]

jrep = (Env.hail().variant.GenomeReference
jrep = (Env.hail().variant.ReferenceGenome
.apply(name,
contigs,
lengths,
Expand All @@ -68,14 +68,14 @@ def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contig
self._mt_contigs = mt_contigs
self._par = par

super(GenomeReference, self).__init__()
super(ReferenceGenome, self).__init__()

@handle_py4j
def __str__(self):
return self._jrep.toString()

def __repr__(self):
return 'GenomeReference(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
(self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self.par)

@handle_py4j
Expand All @@ -88,67 +88,68 @@ def __hash__(self):

@property
def name(self):
"""Name of genome reference
"""Name of reference genome.
:rtype: str
"""
return self._name

@property
def contigs(self):
"""Contig names
"""Contig names.
:rtype: list of str
"""
return self._contigs

@property
def lengths(self):
"""Map of contig name to contig length
"""Dict of contig name to contig length.
:rtype: dict of str to int
"""
return self._lengths

@property
def x_contigs(self):
"""X contigs
"""X contigs.
:rtype: list of str
"""
return self._x_contigs

@property
def y_contigs(self):
"""Y contigs
"""Y contigs.
:rtype: list of str
"""
return self._y_contigs

@property
def mt_contigs(self):
"""Mitochondrial contigs
"""Mitochondrial contigs.
:rtype: list of str
"""
return self._mt_contigs

@property
def par(self):
"""Pseudoautosomal regions
"""Pseudoautosomal regions.
:rtype: list of :class:`.Interval`
"""
return self._par

@typecheck_method(contig=strlike)
def contig_length(self, contig):
"""Contig length
"""Contig length.
:param contig: Contig to get length of
:param contig: Contig
:type contig: str
:return: Length of contig
:rtype: int
"""
return self._jrep.contigLength(contig)
Expand All @@ -157,43 +158,43 @@ def contig_length(self, contig):
@record_classmethod
@handle_py4j
def GRCh37(cls):
"""Genome reference for GRCh37
"""Reference genome for GRCh37.
Data from `<ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict>`_
Data from `GATK resource bundle <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict>`_.
>>> gr37 = GenomeReference.GRCh37()
>>> grch37 = ReferenceGenome.GRCh37()
:rtype: :class:`.GenomeReference`
:rtype: :class:`.ReferenceGenome`
"""
return GenomeReference._from_java(Env.hail().variant.GenomeReference.GRCh37())
return ReferenceGenome._from_java(Env.hail().variant.ReferenceGenome.GRCh37())

@classmethod
@record_classmethod
@handle_py4j
def GRCh38(cls):
"""Genome reference for GRCh38
"""Reference genome for GRCh38.
Data from `<ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.dict>`_
Data from `GATK resource bundle <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.dict>`_.
>>> gr38 = GenomeReference.GRCh38()
>>> grch38 = ReferenceGenome.GRCh38()
:rtype: :class:`.GenomeReference`
:rtype: :class:`.ReferenceGenome`
"""
return GenomeReference._from_java(Env.hail().variant.GenomeReference.GRCh38())
return ReferenceGenome._from_java(Env.hail().variant.ReferenceGenome.GRCh38())

@handle_py4j
def _init_from_java(self, jrep):
self._jrep = jrep

@classmethod
def _from_java(cls, jrep):
gr = GenomeReference.__new__(cls)
gr._init_from_java(jrep)
gr._name = jrep.name()
gr._contigs = [str(x) for x in jrep.contigs()]
gr._lengths = {str(x._1()): int(x._2()) for x in jiterable_to_list(jrep.lengths())}
gr._x_contigs = [str(x) for x in jiterable_to_list(jrep.xContigs())]
gr._y_contigs = [str(x) for x in jiterable_to_list(jrep.yContigs())]
gr._mt_contigs = [str(x) for x in jiterable_to_list(jrep.mtContigs())]
gr._par = [Interval._from_java(x) for x in jrep.par()]
return gr
rg = ReferenceGenome.__new__(cls)
rg._init_from_java(jrep)
rg._name = jrep.name()
rg._contigs = [str(x) for x in jrep.contigs()]
rg._lengths = {str(x._1()): int(x._2()) for x in jiterable_to_list(jrep.lengths())}
rg._x_contigs = [str(x) for x in jiterable_to_list(jrep.xContigs())]
rg._y_contigs = [str(x) for x in jiterable_to_list(jrep.yContigs())]
rg._mt_contigs = [str(x) for x in jiterable_to_list(jrep.mtContigs())]
rg._par = [Interval._from_java(x) for x in jrep.par()]
return rg
34 changes: 17 additions & 17 deletions python/hail/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,14 +730,14 @@ def test_representation(self):
self.assertFalse(c_nocall.is_het_non_ref())
self.assertFalse(c_nocall.is_het_ref())

gr = GenomeReference.GRCh37()
self.assertEqual(gr.name, "GRCh37")
self.assertEqual(gr.contigs[0], "1")
self.assertListEqual(gr.x_contigs, ["X"])
self.assertListEqual(gr.y_contigs, ["Y"])
self.assertListEqual(gr.mt_contigs, ["MT"])
self.assertEqual(gr.par[0], Interval.parse("X:60001-2699521"))
self.assertEqual(gr.contig_length("1"), 249250621)
rg = ReferenceGenome.GRCh37()
self.assertEqual(rg.name, "GRCh37")
self.assertEqual(rg.contigs[0], "1")
self.assertListEqual(rg.x_contigs, ["X"])
self.assertListEqual(rg.y_contigs, ["Y"])
self.assertListEqual(rg.mt_contigs, ["MT"])
self.assertEqual(rg.par[0], Interval.parse("X:60001-2699521"))
self.assertEqual(rg.contig_length("1"), 249250621)

name = "test"
contigs = ["1", "X", "Y", "MT"]
Expand All @@ -747,15 +747,15 @@ def test_representation(self):
mt_contigs = ["MT"]
par = [Interval(Locus("X", 5), Locus("X", 1000))]

gr2 = GenomeReference(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par)
self.assertEqual(gr2.name, name)
self.assertListEqual(gr2.contigs, contigs)
self.assertListEqual(gr2.x_contigs, x_contigs)
self.assertListEqual(gr2.y_contigs, y_contigs)
self.assertListEqual(gr2.mt_contigs, mt_contigs)
self.assertEqual(gr2.par, par)
self.assertEqual(gr2.contig_length("1"), 10000)
self.assertDictEqual(gr2.lengths, lengths)
rg2 = ReferenceGenome(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par)
self.assertEqual(rg2.name, name)
self.assertListEqual(rg2.contigs, contigs)
self.assertListEqual(rg2.x_contigs, x_contigs)
self.assertListEqual(rg2.y_contigs, y_contigs)
self.assertListEqual(rg2.mt_contigs, mt_contigs)
self.assertEqual(rg2.par, par)
self.assertEqual(rg2.contig_length("1"), 10000)
self.assertDictEqual(rg2.lengths, lengths)

def test_types(self):
self.assertEqual(TInt32(), TInt32())
Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/is/hail/HailContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import is.hail.io.vcf._
import is.hail.keytable.KeyTable
import is.hail.stats.{BaldingNicholsModel, Distribution, UniformDist}
import is.hail.utils.{log, _}
import is.hail.variant.{GenericDataset, GenomeReference, Genotype, Locus, VSMFileMetadata, VSMSubgen, Variant, VariantDataset, VariantSampleMatrix}
import is.hail.variant.{GenericDataset, ReferenceGenome, Genotype, Locus, VSMFileMetadata, VSMSubgen, Variant, VariantDataset, VariantSampleMatrix}
import org.apache.hadoop
import org.apache.log4j.{ConsoleAppender, LogManager, PatternLayout, PropertyConfigurator}
import org.apache.spark.deploy.SparkHadoopUtil
Expand Down Expand Up @@ -296,7 +296,7 @@ class HailContext private(val sc: SparkContext,

val signature = TStruct("rsid" -> TString, "varid" -> TString)

val rdd = sc.union(results.map(_.rdd)).toOrderedRDD(TVariant(GenomeReference.GRCh37).orderedKey, classTag[(Annotation, Iterable[Annotation])])
val rdd = sc.union(results.map(_.rdd)).toOrderedRDD(TVariant(ReferenceGenome.GRCh37).orderedKey, classTag[(Annotation, Iterable[Annotation])])

new GenericDataset(this,
VSMFileMetadata(samples,
Expand Down Expand Up @@ -575,7 +575,7 @@ class HailContext private(val sc: SparkContext,

def eval(expr: String): (Annotation, Type) = {
val ec = EvalContext(
"v" -> TVariant(GenomeReference.GRCh37),
"v" -> TVariant(ReferenceGenome.GRCh37),
"s" -> TString,
"g" -> TGenotype,
"sa" -> TStruct(
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/is/hail/annotations/MemoryBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -662,8 +662,8 @@ class RegionValueBuilder(region: MemoryBuffer) {
case t: TInterval =>
val i = a.asInstanceOf[Interval[Locus]]
startStruct()
addAnnotation(TLocus(t.gr), i.start)
addAnnotation(TLocus(t.gr), i.end)
addAnnotation(TLocus(t.rg), i.start)
addAnnotation(TLocus(t.rg), i.end)
endStruct()
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/is/hail/annotations/UnsafeRow.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, KryoSerializable}
import is.hail.expr._
import is.hail.utils.Interval
import is.hail.variant.{AltAllele, GenericGenotype, GenomeReference, Locus, Variant}
import is.hail.variant.{AltAllele, GenericGenotype, ReferenceGenome, Locus, Variant}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Row

Expand Down Expand Up @@ -76,7 +76,7 @@ object UnsafeRow {
new String(readBinary(region, offset))

def readLocus(region: MemoryBuffer, offset: Long): Locus = {
val ft = TLocus(GenomeReference.GRCh37).fundamentalType.asInstanceOf[TStruct]
val ft = TLocus(ReferenceGenome.GRCh37).fundamentalType.asInstanceOf[TStruct]
Locus(
readString(region, offset + ft.byteOffsets(0)),
region.loadInt(offset + ft.byteOffsets(1)))
Expand Down
8 changes: 4 additions & 4 deletions src/main/scala/is/hail/expr/AnnotationImpex.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package is.hail.expr

import is.hail.annotations.Annotation
import is.hail.utils.{Interval, _}
import is.hail.variant.{AltAllele, GenomeReference, Genotype, Locus, Variant}
import is.hail.variant.{AltAllele, ReferenceGenome, Genotype, Locus, Variant}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.json4s._
Expand Down Expand Up @@ -97,7 +97,7 @@ object SparkAnnotationImpex extends AnnotationImpex[DataType, Any] {
Locus(r.getAs[String](0), r.getAs[Int](1))
case x: TInterval =>
val r = a.asInstanceOf[Row]
Interval(importAnnotation(r.get(0), TLocus(x.gr)).asInstanceOf[Locus], importAnnotation(r.get(1), TLocus(x.gr)).asInstanceOf[Locus])
Interval(importAnnotation(r.get(0), TLocus(x.rg)).asInstanceOf[Locus], importAnnotation(r.get(1), TLocus(x.rg)).asInstanceOf[Locus])
case TStruct(fields) =>
if (fields.isEmpty)
if (a.asInstanceOf[Boolean]) Annotation.empty else null
Expand Down Expand Up @@ -217,10 +217,10 @@ case class JSONExtractInterval(start: Locus, end: Locus) {

case class JSONExtractContig(name: String, length: Int)

case class JSONExtractGenomeReference(name: String, contigs: Array[JSONExtractContig], xContigs: Set[String],
case class JSONExtractReferenceGenome(name: String, contigs: Array[JSONExtractContig], xContigs: Set[String],
yContigs: Set[String], mtContigs: Set[String], par: Array[JSONExtractInterval]) {

def toGenomeReference: GenomeReference = GenomeReference(name, contigs.map(_.name),
def toReferenceGenome: ReferenceGenome = ReferenceGenome(name, contigs.map(_.name),
contigs.map(c => (c.name, c.length)).toMap, xContigs, yContigs, mtContigs, par.map(_.toInterval))
}

Expand Down
Loading

0 comments on commit ee6527f

Please sign in to comment.