Skip to content

Commit

Permalink
[query] Add compute_charr (#12485)
Browse files Browse the repository at this point in the history
* [query] Add `compute_charr`

* fix lints

* fix lint

Co-authored-by: Dan King <daniel.zidan.king@gmail.com>
  • Loading branch information
tpoterba and danking authored Nov 29, 2022
1 parent 1bfdc5c commit b033274
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 3 deletions.
3 changes: 2 additions & 1 deletion hail/python/hail/methods/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
linear_regression_rows, _linear_regression_rows_nd, logistic_regression_rows,
_logistic_regression_rows_nd, poisson_regression_rows,
linear_mixed_regression_rows, lambda_gc)
from .qc import sample_qc, variant_qc, vep, concordance, nirvana, summarize_variants
from .qc import sample_qc, variant_qc, vep, concordance, nirvana, summarize_variants, compute_charr
from .misc import rename_duplicates, maximal_independent_set, segment_intervals, filter_intervals
from .relatedness import identity_by_descent, king, pc_relate

Expand Down Expand Up @@ -80,6 +80,7 @@
'filter_alleles',
'filter_alleles_hts',
'summarize_variants',
'compute_charr',
'row_correlation',
'ld_matrix',
'king'
Expand Down
128 changes: 126 additions & 2 deletions hail/python/hail/methods/qc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import hail as hl
from collections import Counter
import os
from typing import Tuple, List, Union
from hail.typecheck import typecheck, oneof, anytype, nullable
from typing import Tuple, List, Union, Optional
from hail.typecheck import typecheck, oneof, anytype, nullable, numeric
from hail.expr.expressions.expression_typecheck import expr_float64
from hail.utils.java import Env, info, warning
from hail.utils.misc import divide_null, guess_cloud_spark_provider
from hail.matrixtable import MatrixTable
Expand Down Expand Up @@ -1173,3 +1174,126 @@ def explode_result(alleles):
allele_counts=allele_counts,
n_variants=n_variants,
r_ti_tv=nti / ntv)


@typecheck(ds=oneof(hl.MatrixTable, hl.vds.VariantDataset),
min_af=numeric,
max_af=numeric,
min_dp=int,
max_dp=int,
min_gq=int,
ref_AF=nullable(expr_float64))
def compute_charr(
ds: Union[hl.MatrixTable, hl.vds.VariantDataset],
min_af: float = 0.05,
max_af: float = 0.95,
min_dp: int = 10,
max_dp: int = 100,
min_gq: int = 20,
ref_AF: Optional[hl.Float64Expression] = None
):
"""Compute CHARR, the DNA sample contamination estimator.
.. include:: _templates/experimental.rst
Notes
-----
The returned table has the sample ID field, plus the field:
- `charr` (float64): CHARR contamination estimation.
Note
-----
It is possible to use gnomAD reference allele frequencies with the following:
>>> gnomad_sites = hl.experimental.load_dataset('gnomad_genome_sites', version='3.1.2') # doctest: +SKIP
>>> charr_result = hl.compute_charr(mt, ref_af=(1 - gnomad_sites[mt.row_key].freq[1])) # doctest: +SKIP
Parameters
----------
ds : :class:`.MatrixTable` or :class:`.VariantDataset`
Dataset.
min_af
Minimum reference allele frequency to filter variants.
max_af
Maximum reference allele frequency to filter variants.
min_dp
Minimum sequencing depth to filter variants.
max_dp
Maximum sequencing depth to filter variants.
min_gq
Minimum genotype quality to filter variants
ref_AF
Reference AF expression. Necessary when the sample size is below 10,000.
Returns
-------
:class:`.Table`
"""

# Determine whether the input data is in the VDS format; if not, convert matrixtable to VDS and extract only the variant call information
if isinstance(ds, hl.vds.VariantDataset):
mt = ds.variant_data
else:
mt = ds

if all(x in mt.entry for x in ['LA', 'LAD', 'LGT', 'GQ']):
ad_field = 'LAD'
gt_field = 'LGT'
elif all(x in mt.entry for x in ['AD', 'GT', 'GQ']):
ad_field = 'AD'
gt_field = 'GT'
else:
raise ValueError(f"'compute_charr': require a VDS or MatrixTable with fields LAD/LAD/LGT/GQ/DP or AD/GT/GQ/DP,"
f" found entry fields {list(mt.entry)}")
# Annotate reference allele frequency when it is not defined in the original data, and name it 'ref_AF'.
ref_af_field = '__ref_af'
if ref_AF is None:
n_samples = mt.count_cols()
if n_samples < 10000:
raise ValueError("'compute_charr': with fewer than 10,000 samples, require a reference AF in 'reference_data_source'.")

n_alleles = 2 * n_samples
mt = mt.annotate_rows(
**{ref_af_field: 1 - hl.agg.sum(mt[gt_field].n_alt_alleles()) / n_alleles}
)
else:
mt = mt.annotate_rows(**{ref_af_field: ref_AF})

# Filter to autosomal biallelic SNVs with reference allele frequency within the range (min_af, max_af)
rg = mt.locus.dtype.reference_genome.name
if rg == 'GRCh37':
mt = hl.filter_intervals(mt, [hl.parse_locus_interval('1-22', reference_genome=rg)])
elif rg == 'GRCh38':
mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr1-chr22', reference_genome=rg)])
else:
mt = mt.filter_rows(mt.locus.in_autosome())

mt = mt.filter_rows(
(hl.len(mt.alleles) == 2)
& hl.is_snp(mt.alleles[0], mt.alleles[1])
& (mt[ref_af_field] > min_af)
& (mt[ref_af_field] < max_af)
)

# Filter to variant calls with GQ above min_gq and DP within the range (min_dp, max_dp)
ad_dp = mt['DP'] if 'DP' in mt.entry else hl.sum(mt[ad_field])
mt = mt.filter_entries(
mt[gt_field].is_hom_var() & (mt.GQ >= min_gq) & (ad_dp >= min_dp) & (ad_dp <= max_dp)
)

# Compute CHARR
mt = mt.select_cols(
charr=hl.agg.mean((mt[ad_field][0] / (mt[ad_field][0] + mt[ad_field][1])) / mt[ref_af_field])
)

mt = mt.select_globals(
af_min=min_af,
af_max=max_af,
dp_min=min_dp,
dp_max=max_dp,
gq_min=min_gq,
)

return mt.cols()
9 changes: 9 additions & 0 deletions hail/python/test/hail/methods/test_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,12 @@ def test_summarize_variants_ti_tv(self):
assert r['n_variants'] == 346
assert r['r_ti_tv'] == 2.5
assert r['allele_counts'] == {2: 346}

def test_charr(self):
mt = hl.import_vcf(resource('sample.vcf'))
es = mt.select_rows().entries()
charr = hl.compute_charr(mt, ref_AF=0.9)
d = charr.aggregate(hl.dict(hl.agg.collect((charr.s, charr.charr))))

assert pytest.approx(d['C1046::HG02024'], abs=0.0001) == .00126
assert pytest.approx(d['C1046::HG02025'], abs=0.0001) == .00124

0 comments on commit b033274

Please sign in to comment.