diff --git a/hail/.gitignore b/hail/.gitignore index eebbbc4cbe3..506cc43dd2c 100644 --- a/hail/.gitignore +++ b/hail/.gitignore @@ -10,6 +10,7 @@ python/hail/docs/_build/* python/hail/docs/_static/hail_version.js python/hailtop/dist python/hailtop/hailctl/deploy.yaml +python/hail/docs/genetics/hail.genetics.AlleleType.rst python/hail/docs/genetics/hail.genetics.Call.rst python/hail/docs/genetics/hail.genetics.Locus.rst python/hail/docs/genetics/hail.genetics.Pedigree.rst @@ -65,6 +66,7 @@ python/hail/docs/vds/hail.vds.to_merged_sparse_mt.rst python/hail/docs/vds/hail.vds.local_to_global.rst python/hail/docs/vds/hail.vds.merge_reference_blocks.rst python/hail/docs/vds/hail.vds.truncate_reference_blocks.rst +python/hail/docs/tutorials/iframe_figures/ src/main/c/.cxx.vsn src/main/c/headers src/main/c/lib diff --git a/hail/python/hail/docs/functions/genetics.rst b/hail/python/hail/docs/functions/genetics.rst index b7b5265cbae..4569a919e2a 100644 --- a/hail/python/hail/docs/functions/genetics.rst +++ b/hail/python/hail/docs/functions/genetics.rst @@ -31,6 +31,7 @@ Genetics functions is_valid_locus contig_length allele_type + numeric_allele_type pl_dosage gp_dosage get_sequence @@ -65,6 +66,7 @@ Genetics functions .. autofunction:: is_valid_locus .. autofunction:: contig_length .. autofunction:: allele_type +.. autofunction:: numeric_allele_type .. autofunction:: pl_dosage .. autofunction:: gp_dosage .. 
autofunction:: get_sequence diff --git a/hail/python/hail/docs/genetics/index.rst b/hail/python/hail/docs/genetics/index.rst index c203b1de41f..44b3756576b 100644 --- a/hail/python/hail/docs/genetics/index.rst +++ b/hail/python/hail/docs/genetics/index.rst @@ -14,6 +14,7 @@ genetics :toctree: ./ :template: class.rst + AlleleType Call Locus Pedigree diff --git a/hail/python/hail/expr/__init__.py b/hail/python/hail/expr/__init__.py index 5a246fe3cb6..23739d77270 100644 --- a/hail/python/hail/expr/__init__.py +++ b/hail/python/hail/expr/__init__.py @@ -181,6 +181,7 @@ is_complex, is_strand_ambiguous, allele_type, + numeric_allele_type, hamming, mendel_error_code, triangle, @@ -406,6 +407,7 @@ 'is_complex', 'is_strand_ambiguous', 'allele_type', + 'numeric_allele_type', 'hamming', 'mendel_error_code', 'triangle', diff --git a/hail/python/hail/expr/functions.py b/hail/python/hail/expr/functions.py index 20f1a108df0..198e2534fa2 100644 --- a/hail/python/hail/expr/functions.py +++ b/hail/python/hail/expr/functions.py @@ -85,6 +85,7 @@ is_float32, is_float64, ) +from hail.genetics.allele_type import AlleleType from hail.genetics.reference_genome import reference_genome_type, ReferenceGenome import hail.ir as ir from hail.typecheck import ( @@ -3278,16 +3279,26 @@ def corr(x, y) -> Float64Expression: return _func("corr", tfloat64, x, y) -_base_regex = "^([ACGTNM])+$" -_symbolic_regex = r"(^\.)|(\.$)|(^<)|(>$)|(\[)|(\])" -_allele_types = ["Unknown", "SNP", "MNP", "Insertion", "Deletion", "Complex", "Star", "Symbolic"] -_allele_enum = {i: v for i, v in builtins.enumerate(_allele_types)} -_allele_ints = {v: k for k, v in _allele_enum.items()} - - @typecheck(ref=expr_str, alt=expr_str) @ir.udf(tstr, tstr) -def _num_allele_type(ref, alt) -> Int32Expression: +def numeric_allele_type(ref, alt) -> Int32Expression: + """Returns the type of the polymorphism as an integer. The value returned + is the integer value of :class:`.AlleleType` representing that kind of + polymorphism. 
+ + Examples + -------- + + >>> hl.eval(hl.numeric_allele_type('A', 'T')) == AlleleType.SNP + True + + Notes + ----- + The values of :class:`.AlleleType` are not stable and thus should not be + relied upon across hail versions. + """ + _base_regex = "^([ACGTNM])+$" + _symbolic_regex = r"(^\.)|(\.$)|(^<)|(>$)|(\[)|(\])" return hl.bind( lambda r, a: hl.if_else( r.matches(_base_regex), @@ -3299,24 +3310,33 @@ def _num_allele_type(ref, alt) -> Int32Expression: r.length() == a.length(), hl.if_else( r.length() == 1, - hl.if_else(r != a, _allele_ints['SNP'], _allele_ints['Unknown']), - hl.if_else(hamming(r, a) == 1, _allele_ints['SNP'], _allele_ints['MNP']), + hl.if_else(r != a, AlleleType.SNP, AlleleType.UNKNOWN), + hl.if_else(hamming(r, a) == 1, AlleleType.SNP, AlleleType.MNP), ), ) - .when((r.length() < a.length()) & (r[0] == a[0]) & a.endswith(r[1:]), _allele_ints["Insertion"]) - .when((r[0] == a[0]) & r.endswith(a[1:]), _allele_ints["Deletion"]) - .default(_allele_ints['Complex']), + .when((r.length() < a.length()) & (r[0] == a[0]) & a.endswith(r[1:]), AlleleType.INSERTION) + .when((r[0] == a[0]) & r.endswith(a[1:]), AlleleType.DELETION) + .default(AlleleType.COMPLEX), ) - .when(a == '*', _allele_ints['Star']) - .when(a.matches(_symbolic_regex), _allele_ints['Symbolic']) - .default(_allele_ints['Unknown']), - _allele_ints['Unknown'], + .when(a == '*', AlleleType.STAR) + .when(a.matches(_symbolic_regex), AlleleType.SYMBOLIC) + .default(AlleleType.UNKNOWN), + AlleleType.UNKNOWN, ), ref, alt, ) +@deprecated(version='0.2.129', reason="Replaced by the public numeric_allele_type") +@typecheck(ref=expr_str, alt=expr_str) +def _num_allele_type(ref, alt) -> Int32Expression: + """Provided for backwards compatibility, don't use it in new code, or + within the hail library itself + """ + return numeric_allele_type(ref, alt) + + @typecheck(ref=expr_str, alt=expr_str) def is_snp(ref, alt) -> BooleanExpression: """Returns ``True`` if the alleles constitute a single nucleotide 
polymorphism. @@ -3338,7 +3358,7 @@ def is_snp(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return _num_allele_type(ref, alt) == _allele_ints["SNP"] + return numeric_allele_type(ref, alt) == AlleleType.SNP @typecheck(ref=expr_str, alt=expr_str) @@ -3362,7 +3382,7 @@ def is_mnp(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return _num_allele_type(ref, alt) == _allele_ints["MNP"] + return numeric_allele_type(ref, alt) == AlleleType.MNP @typecheck(ref=expr_str, alt=expr_str) @@ -3458,7 +3478,7 @@ def is_insertion(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return _num_allele_type(ref, alt) == _allele_ints["Insertion"] + return numeric_allele_type(ref, alt) == AlleleType.INSERTION @typecheck(ref=expr_str, alt=expr_str) @@ -3482,7 +3502,7 @@ def is_deletion(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return _num_allele_type(ref, alt) == _allele_ints["Deletion"] + return numeric_allele_type(ref, alt) == AlleleType.DELETION @typecheck(ref=expr_str, alt=expr_str) @@ -3506,9 +3526,7 @@ def is_indel(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return hl.bind( - lambda t: (t == _allele_ints["Insertion"]) | (t == _allele_ints["Deletion"]), _num_allele_type(ref, alt) - ) + return hl.bind(lambda t: (t == AlleleType.INSERTION) | (t == AlleleType.DELETION), numeric_allele_type(ref, alt)) @typecheck(ref=expr_str, alt=expr_str) @@ -3532,7 +3550,7 @@ def is_star(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return _num_allele_type(ref, alt) == _allele_ints["Star"] + return numeric_allele_type(ref, alt) == AlleleType.STAR @typecheck(ref=expr_str, alt=expr_str) @@ -3556,7 +3574,7 @@ def is_complex(ref, alt) -> BooleanExpression: ------- :class:`.BooleanExpression` """ - return _num_allele_type(ref, alt) == _allele_ints["Complex"] + return numeric_allele_type(ref, alt) == AlleleType.COMPLEX @typecheck(ref=expr_str, 
alt=expr_str) @@ -3624,7 +3642,7 @@ def allele_type(ref, alt) -> StringExpression: ------- :class:`.StringExpression` """ - return hl.literal(_allele_types)[_num_allele_type(ref, alt)] + return hl.literal(AlleleType.strings())[numeric_allele_type(ref, alt)] @typecheck(s1=expr_str, s2=expr_str) diff --git a/hail/python/hail/genetics/__init__.py b/hail/python/hail/genetics/__init__.py index 30bead08cb0..ee7ff734ead 100644 --- a/hail/python/hail/genetics/__init__.py +++ b/hail/python/hail/genetics/__init__.py @@ -1,6 +1,7 @@ +from .allele_type import AlleleType from .call import Call from .reference_genome import ReferenceGenome from .pedigree import Pedigree, Trio from .locus import Locus -__all__ = ['Locus', 'Call', 'Pedigree', 'Trio', 'ReferenceGenome'] +__all__ = ['AlleleType', 'Locus', 'Call', 'Pedigree', 'Trio', 'ReferenceGenome'] diff --git a/hail/python/hail/genetics/allele_type.py b/hail/python/hail/genetics/allele_type.py new file mode 100644 index 00000000000..97a874e2c3e --- /dev/null +++ b/hail/python/hail/genetics/allele_type.py @@ -0,0 +1,96 @@ +from enum import IntEnum, auto + + +_ALLELE_STRS = ( + "Unknown", + "SNP", + "MNP", + "Insertion", + "Deletion", + "Complex", + "Star", + "Symbolic", + "Transition", + "Transversion", +) + + +class AlleleType(IntEnum): + """An enumeration for allele type. + + Notes + ----- + The precise values of the enumeration constants are not guaranteed + to be stable and must not be relied upon. + """ + + UNKNOWN = 0 + """Unknown Allele Type""" + SNP = auto() + """Single-nucleotide Polymorphism (SNP)""" + MNP = auto() + """Multi-nucleotide Polymorphism (MNP)""" + INSERTION = auto() + """Insertion""" + DELETION = auto() + """Deletion""" + COMPLEX = auto() + """Complex Polymorphism""" + STAR = auto() + """Star Allele (``alt=*``)""" + SYMBOLIC = auto() + """Symbolic Allele + + e.g. ``alt=<INS>`` + """ + TRANSITION = auto() + """Transition SNP + + e.g. 
``ref=A alt=G`` + + Note + ---- + This is only really used internally in :func:`hail.vds.sample_qc` and + :func:`hail.methods.sample_qc`. + """ + TRANSVERSION = auto() + """Transversion SNP + + e.g. ``ref=A alt=C`` + + Note + ---- + This is only really used internally in :func:`hail.vds.sample_qc` and + :func:`hail.methods.sample_qc`. + """ + + def __str__(self): + return str(self.value) + + @property + def pretty_name(self): + """A formatted (as opposed to uppercase) version of the member's name, + to match :func:`~hail.expr.functions.allele_type` + + Examples + -------- + >>> AlleleType.INSERTION.pretty_name + 'Insertion' + >>> at = AlleleType(hl.eval(hl.numeric_allele_type('a', 'att'))) + >>> at.pretty_name == hl.eval(hl.allele_type('a', 'att')) + True + """ + return _ALLELE_STRS[self] + + @classmethod + def _missing_(cls, value): + if not isinstance(value, str): + return None + return cls.__members__.get(value.upper()) + + @staticmethod + def strings(): + """Returns the names of the allele types, for use with + :func:`~hail.expr.functions.literal` + """ + return list(_ALLELE_STRS) diff --git a/hail/python/hail/methods/qc.py b/hail/python/hail/methods/qc.py index f99c1f99536..8f3841cffd3 100644 --- a/hail/python/hail/methods/qc.py +++ b/hail/python/hail/methods/qc.py @@ -15,8 +15,10 @@ from hailtop import yamlx from hail.backend.service_backend import ServiceBackend +from hail.genetics.allele_type import AlleleType from hail.typecheck import typecheck, oneof, anytype, nullable, numeric from hail.expr.expressions.expression_typecheck import expr_float64 +from hail.expr.functions import numeric_allele_type from hail.utils import FatalError from hail.utils.java import Env, info, warning from hail.utils.misc import divide_null, guess_cloud_spark_provider, new_temp_file @@ -42,6 +44,17 @@ ) +def _qc_allele_type(ref, alt): + return hl.bind( + lambda at: hl.if_else( + at == AlleleType.SNP, + hl.if_else(hl.is_transition(ref, alt), AlleleType.TRANSITION, 
AlleleType.TRANSVERSION), + at, + ), + numeric_allele_type(ref, alt), + ) + + @typecheck(mt=MatrixTable, name=str) def sample_qc(mt, name='sample_qc') -> MatrixTable: """Compute per-sample metrics useful for quality control. @@ -115,28 +128,11 @@ def sample_qc(mt, name='sample_qc') -> MatrixTable: require_row_key_variant(mt, 'sample_qc') - from hail.expr.functions import _num_allele_type, _allele_types - - allele_types = _allele_types[:] - allele_types.extend(['Transition', 'Transversion']) - allele_enum = {i: v for i, v in enumerate(allele_types)} - allele_ints = {v: k for k, v in allele_enum.items()} - - def allele_type(ref, alt): - return hl.bind( - lambda at: hl.if_else( - at == allele_ints['SNP'], - hl.if_else(hl.is_transition(ref, alt), allele_ints['Transition'], allele_ints['Transversion']), - at, - ), - _num_allele_type(ref, alt), - ) - variant_ac = Env.get_uid() variant_atypes = Env.get_uid() mt = mt.annotate_rows(**{ variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC, - variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt)), + variant_atypes: mt.alleles[1:].map(lambda alt: _qc_allele_type(mt.alleles[0], alt)), }) bound_exprs = {} @@ -175,7 +171,7 @@ def has_field_of_type(name, dtype): ) bound_exprs['allele_type_counts'] = hl.agg.explode( - lambda allele_type: hl.tuple(hl.agg.count_where(allele_type == i) for i in range(len(allele_ints))), + lambda allele_type: hl.tuple(hl.agg.count_where(allele_type == i) for i in range(len(AlleleType))), ( hl.range(0, mt['GT'].ploidy) .map(lambda i: mt['GT'][i]) @@ -198,14 +194,12 @@ def has_field_of_type(name, dtype): 'n_hom_var': x.n_called - x.n_hom_ref - x.n_het, 'n_non_ref': x.n_called - x.n_hom_ref, 'n_singleton': x.n_singleton, - 'n_snp': ( - x.allele_type_counts[allele_ints["Transition"]] + x.allele_type_counts[allele_ints["Transversion"]] - ), - 'n_insertion': x.allele_type_counts[allele_ints["Insertion"]], - 'n_deletion': x.allele_type_counts[allele_ints["Deletion"]], - 'n_transition': 
x.allele_type_counts[allele_ints["Transition"]], - 'n_transversion': x.allele_type_counts[allele_ints["Transversion"]], - 'n_star': x.allele_type_counts[allele_ints["Star"]], + 'n_snp': x.allele_type_counts[AlleleType.TRANSITION] + x.allele_type_counts[AlleleType.TRANSVERSION], + 'n_insertion': x.allele_type_counts[AlleleType.INSERTION], + 'n_deletion': x.allele_type_counts[AlleleType.DELETION], + 'n_transition': x.allele_type_counts[AlleleType.TRANSITION], + 'n_transversion': x.allele_type_counts[AlleleType.TRANSVERSION], + 'n_star': x.allele_type_counts[AlleleType.STAR], }), lambda s: s.annotate( r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion), diff --git a/hail/python/hail/vds/combiner/combine.py b/hail/python/hail/vds/combiner/combine.py index 9d4662e6fd6..e1d6d85eced 100644 --- a/hail/python/hail/vds/combiner/combine.py +++ b/hail/python/hail/vds/combiner/combine.py @@ -6,6 +6,8 @@ from hail.experimental.function import Function from hail.expr import StructExpression, unify_all, construct_expr from hail.expr.expressions import expr_bool, expr_str +from hail.expr.functions import numeric_allele_type +from hail.genetics.allele_type import AlleleType from hail.genetics.reference_genome import reference_genome_type from hail.ir import Apply, TableMapRows from hail.typecheck import oneof, sequenceof, typecheck @@ -459,8 +461,6 @@ def unlocalize(mt): def merge_alleles(alleles): - from hail.expr.functions import _num_allele_type, _allele_ints - return hl.rbind( alleles.map(lambda a: hl.or_else(a[0], '')).fold(lambda s, t: hl.if_else(hl.len(s) > hl.len(t), s, t), ''), lambda ref: hl.rbind( @@ -470,13 +470,13 @@ def merge_alleles(alleles): lambda r: hl.array([ref]).extend( al[1:].map( lambda a: hl.rbind( - _num_allele_type(r, a), + numeric_allele_type(r, a), lambda at: hl.if_else( - (_allele_ints['SNP'] == at) - | (_allele_ints['Insertion'] == at) - | (_allele_ints['Deletion'] == at) - | (_allele_ints['MNP'] == at) - | (_allele_ints['Complex'] == at), 
+ (at == AlleleType.SNP) + | (at == AlleleType.INSERTION) + | (at == AlleleType.DELETION) + | (at == AlleleType.MNP) + | (at == AlleleType.COMPLEX), a + ref[hl.len(r) :], a, ), diff --git a/hail/python/hail/vds/sample_qc.py b/hail/python/hail/vds/sample_qc.py index 27011269cfa..6cb6f8044df 100644 --- a/hail/python/hail/vds/sample_qc.py +++ b/hail/python/hail/vds/sample_qc.py @@ -10,7 +10,9 @@ NumericExpression, StructExpression, ) +from hail.genetics.allele_type import AlleleType from hail.methods.misc import require_first_key_field_locus +from hail.methods.qc import _qc_allele_type from hail.table import Table from hail.typecheck import sequenceof, typecheck, nullable from hail.utils.java import Env @@ -42,24 +44,8 @@ def vmt_sample_qc_variant_annotations( Tuple of expressions representing the AC (first element) and allele type (second element). """ - from hail.expr.functions import _num_allele_type, _allele_types - - allele_types = _allele_types[:] - allele_types.extend(['Transition', 'Transversion']) - allele_enum = dict(enumerate(allele_types)) - allele_ints = {v: k for k, v in allele_enum.items()} - - def allele_type(ref, alt): - return hl.bind( - lambda at: hl.if_else( - at == allele_ints['SNP'], - hl.if_else(hl.is_transition(ref, alt), allele_ints['Transition'], allele_ints['Transversion']), - at, - ), - _num_allele_type(ref, alt), - ) - return (hl.agg.call_stats(global_gt, alleles).AC, alleles[1:].map(lambda alt: allele_type(alleles[0], alt))) + return (hl.agg.call_stats(global_gt, alleles).AC, alleles[1:].map(lambda alt: _qc_allele_type(alleles[0], alt))) @typecheck( @@ -132,13 +118,6 @@ def vmt_sample_qc( } """ - from hail.expr.functions import _allele_types - - allele_types = _allele_types[:] - allele_types.extend(['Transition', 'Transversion']) - allele_enum = dict(enumerate(allele_types)) - allele_ints = {v: k for k, v in allele_enum.items()} - bound_exprs = {} bound_exprs['n_het'] = hl.agg.count_where(global_gt.is_het()) @@ -162,7 +141,7 @@ def 
vmt_sample_qc( global_gt[i], lambda gti: (gti != 0) & (variant_ac[gti] == 1) - & (variant_atypes[gti - 1] == allele_ints['Transition']), + & (variant_atypes[gti - 1] == AlleleType.TRANSITION), ) ) ), @@ -177,7 +156,7 @@ def vmt_sample_qc( global_gt[i], lambda gti: (gti != 0) & (variant_ac[gti] == 1) - & (variant_atypes[gti - 1] == allele_ints['Transversion']), + & (variant_atypes[gti - 1] == AlleleType.TRANSVERSION), ) ) ), @@ -185,7 +164,7 @@ def vmt_sample_qc( ) bound_exprs['allele_type_counts'] = hl.agg.explode( - lambda allele_type: hl.tuple(hl.agg.count_where(allele_type == i) for i in range(len(allele_ints))), + lambda allele_type: hl.tuple(hl.agg.count_where(allele_type == i) for i in range(len(AlleleType))), ( hl.range(0, global_gt.ploidy) .map(lambda i: global_gt[i]) @@ -211,14 +190,12 @@ def vmt_sample_qc( 'n_singleton': x.n_singleton, 'n_singleton_ti': x.n_singleton_ti, 'n_singleton_tv': x.n_singleton_tv, - 'n_snp': ( - x.allele_type_counts[allele_ints['Transition']] + x.allele_type_counts[allele_ints['Transversion']] - ), - 'n_insertion': x.allele_type_counts[allele_ints['Insertion']], - 'n_deletion': x.allele_type_counts[allele_ints['Deletion']], - 'n_transition': x.allele_type_counts[allele_ints['Transition']], - 'n_transversion': x.allele_type_counts[allele_ints['Transversion']], - 'n_star': x.allele_type_counts[allele_ints['Star']], + 'n_snp': x.allele_type_counts[AlleleType.TRANSITION] + x.allele_type_counts[AlleleType.TRANSVERSION], + 'n_insertion': x.allele_type_counts[AlleleType.INSERTION], + 'n_deletion': x.allele_type_counts[AlleleType.DELETION], + 'n_transition': x.allele_type_counts[AlleleType.TRANSITION], + 'n_transversion': x.allele_type_counts[AlleleType.TRANSVERSION], + 'n_star': x.allele_type_counts[AlleleType.STAR], }), lambda s: s.annotate( r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),