From b033274fee608140ce3aca3aef2ca5b836c25998 Mon Sep 17 00:00:00 2001 From: Tim Poterba Date: Tue, 29 Nov 2022 12:20:42 -0500 Subject: [PATCH] [query] Add `compute_charr` (#12485) * [query] Add `compute_charr` * fix lints * fix lint Co-authored-by: Dan King --- hail/python/hail/methods/__init__.py | 3 +- hail/python/hail/methods/qc.py | 128 ++++++++++++++++++++++- hail/python/test/hail/methods/test_qc.py | 9 ++ 3 files changed, 137 insertions(+), 3 deletions(-) diff --git a/hail/python/hail/methods/__init__.py b/hail/python/hail/methods/__init__.py index 64374c9926b..207c9d4716d 100644 --- a/hail/python/hail/methods/__init__.py +++ b/hail/python/hail/methods/__init__.py @@ -13,7 +13,7 @@ linear_regression_rows, _linear_regression_rows_nd, logistic_regression_rows, _logistic_regression_rows_nd, poisson_regression_rows, linear_mixed_regression_rows, lambda_gc) -from .qc import sample_qc, variant_qc, vep, concordance, nirvana, summarize_variants +from .qc import sample_qc, variant_qc, vep, concordance, nirvana, summarize_variants, compute_charr from .misc import rename_duplicates, maximal_independent_set, segment_intervals, filter_intervals from .relatedness import identity_by_descent, king, pc_relate @@ -80,6 +80,7 @@ 'filter_alleles', 'filter_alleles_hts', 'summarize_variants', + 'compute_charr', 'row_correlation', 'ld_matrix', 'king' diff --git a/hail/python/hail/methods/qc.py b/hail/python/hail/methods/qc.py index 0dd23890c2e..bbee11a1f4d 100644 --- a/hail/python/hail/methods/qc.py +++ b/hail/python/hail/methods/qc.py @@ -1,8 +1,9 @@ import hail as hl from collections import Counter import os -from typing import Tuple, List, Union -from hail.typecheck import typecheck, oneof, anytype, nullable +from typing import Tuple, List, Union, Optional +from hail.typecheck import typecheck, oneof, anytype, nullable, numeric +from hail.expr.expressions.expression_typecheck import expr_float64 from hail.utils.java import Env, info, warning from hail.utils.misc import divide_null, guess_cloud_spark_provider from hail.matrixtable import MatrixTable @@ -1173,3 +1174,126 @@ def explode_result(alleles): allele_counts=allele_counts, n_variants=n_variants, r_ti_tv=nti / ntv) + + +@typecheck(ds=oneof(hl.MatrixTable, hl.vds.VariantDataset), + min_af=numeric, + max_af=numeric, + min_dp=int, + max_dp=int, + min_gq=int, + ref_AF=nullable(expr_float64)) +def compute_charr( + ds: Union[hl.MatrixTable, hl.vds.VariantDataset], + min_af: float = 0.05, + max_af: float = 0.95, + min_dp: int = 10, + max_dp: int = 100, + min_gq: int = 20, + ref_AF: Optional[hl.Float64Expression] = None +): + """Compute CHARR, the DNA sample contamination estimator. + + .. include:: _templates/experimental.rst + + Notes + ----- + + The returned table has the sample ID field, plus the field: + + - `charr` (float64): CHARR contamination estimation. + + Note + ----- + It is possible to use gnomAD reference allele frequencies with the following: + + >>> gnomad_sites = hl.experimental.load_dataset('gnomad_genome_sites', version='3.1.2') # doctest: +SKIP + >>> charr_result = hl.compute_charr(mt, ref_af=(1 - gnomad_sites[mt.row_key].freq[1])) # doctest: +SKIP + + Parameters + ---------- + ds : :class:`.MatrixTable` or :class:`.VariantDataset` + Dataset. + min_af + Minimum reference allele frequency to filter variants. + max_af + Maximum reference allele frequency to filter variants. + min_dp + Minimum sequencing depth to filter variants. + max_dp + Maximum sequencing depth to filter variants. + min_gq + Minimum genotype quality to filter variants + ref_AF + Reference AF expression. Necessary when the sample size is below 10,000. + + Returns + ------- + :class:`.Table` + """ + + # Determine whether the input data is in the VDS format; if not, convert matrixtable to VDS and extract only the variant call information + if isinstance(ds, hl.vds.VariantDataset): + mt = ds.variant_data + else: + mt = ds + + if all(x in mt.entry for x in ['LA', 'LAD', 'LGT', 'GQ']): + ad_field = 'LAD' + gt_field = 'LGT' + elif all(x in mt.entry for x in ['AD', 'GT', 'GQ']): + ad_field = 'AD' + gt_field = 'GT' + else: + raise ValueError(f"'compute_charr': require a VDS or MatrixTable with fields LAD/LAD/LGT/GQ/DP or AD/GT/GQ/DP," + f" found entry fields {list(mt.entry)}") + # Annotate reference allele frequency when it is not defined in the original data, and name it 'ref_AF'. + ref_af_field = '__ref_af' + if ref_AF is None: + n_samples = mt.count_cols() + if n_samples < 10000: + raise ValueError("'compute_charr': with fewer than 10,000 samples, require a reference AF in 'reference_data_source'.") + + n_alleles = 2 * n_samples + mt = mt.annotate_rows( + **{ref_af_field: 1 - hl.agg.sum(mt[gt_field].n_alt_alleles()) / n_alleles} + ) + else: + mt = mt.annotate_rows(**{ref_af_field: ref_AF}) + + # Filter to autosomal biallelic SNVs with reference allele frequency within the range (min_af, max_af) + rg = mt.locus.dtype.reference_genome.name + if rg == 'GRCh37': + mt = hl.filter_intervals(mt, [hl.parse_locus_interval('1-22', reference_genome=rg)]) + elif rg == 'GRCh38': + mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr1-chr22', reference_genome=rg)]) + else: + mt = mt.filter_rows(mt.locus.in_autosome()) + + mt = mt.filter_rows( + (hl.len(mt.alleles) == 2) + & hl.is_snp(mt.alleles[0], mt.alleles[1]) + & (mt[ref_af_field] > min_af) + & (mt[ref_af_field] < max_af) + ) + + # Filter to variant calls with GQ above min_gq and DP within the range (min_dp, max_dp) + ad_dp = mt['DP'] if 'DP' in mt.entry else hl.sum(mt[ad_field]) + mt = mt.filter_entries( + mt[gt_field].is_hom_var() & (mt.GQ >= min_gq) & (ad_dp >= min_dp) & (ad_dp <= max_dp) + ) + + # Compute CHARR + mt = mt.select_cols( + charr=hl.agg.mean((mt[ad_field][0] / (mt[ad_field][0] + mt[ad_field][1])) / mt[ref_af_field]) + ) + + mt = mt.select_globals( + af_min=min_af, + af_max=max_af, + dp_min=min_dp, + dp_max=max_dp, + gq_min=min_gq, + ) + + return mt.cols() diff --git a/hail/python/test/hail/methods/test_qc.py b/hail/python/test/hail/methods/test_qc.py index 2c374ca8cbd..a0d041c807a 100644 --- a/hail/python/test/hail/methods/test_qc.py +++ b/hail/python/test/hail/methods/test_qc.py @@ -279,3 +279,12 @@ def test_summarize_variants_ti_tv(self): assert r['n_variants'] == 346 assert r['r_ti_tv'] == 2.5 assert r['allele_counts'] == {2: 346} + + def test_charr(self): + mt = hl.import_vcf(resource('sample.vcf')) + es = mt.select_rows().entries() + charr = hl.compute_charr(mt, ref_AF=0.9) + d = charr.aggregate(hl.dict(hl.agg.collect((charr.s, charr.charr)))) + + assert pytest.approx(d['C1046::HG02024'], abs=0.0001) == .00126 + assert pytest.approx(d['C1046::HG02025'], abs=0.0001) == .00124