Skip to content

Commit

Permalink
Merge pull request #746 from broadinstitute/jg/add_variant_str_parser
Browse files Browse the repository at this point in the history
Add function `parse_variant` to create a Struct with the locus and alleles from a variant string or contig, position, ref, and alt.
  • Loading branch information
jkgoodrich authored Dec 20, 2024
2 parents d8b64f5 + 2e8b643 commit 727887c
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 0 deletions.
66 changes: 66 additions & 0 deletions gnomad/utils/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""This module contains utility functions for general parsing."""

import logging
from typing import Optional

import hail as hl

# Set up the log record layout (timestamp, logger name, line number, message)
# and create this module's logger, which reports INFO-level records and above.
logging.basicConfig(
    datefmt="%m/%d/%Y %I:%M:%S %p",
    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
)
logger = logging.getLogger("parse_utils")
logger.setLevel(logging.INFO)


def parse_variant(
    variant_str: Optional[str] = None,
    contig: Optional[str] = None,
    position: Optional[int] = None,
    ref: Optional[str] = None,
    alt: Optional[str] = None,
    build: Optional[str] = None,
) -> hl.expr.StructExpression:
    """
    Create a Struct with the locus and alleles from a variant string or contig, position, ref, and alt.

    The input is normalized to the colon-delimited form
    ``contig:position:ref:alt`` and handed to ``hl.parse_variant``.

    .. note::

        If both `variant_str` and all four components are supplied, a
        dash-delimited `variant_str` replaces the components, whereas a
        colon-delimited `variant_str` is replaced by the components.

    :param variant_str: Variant string in the format contig-position-ref-alt or
        contig:position:ref:alt.
    :param contig: Chromosome of the variant.
    :param position: Variant position.
    :param ref: Reference allele.
    :param alt: Alternate allele.
    :param build: Reference genome build. If not provided, will infer from the variant
        string or contig. If 'chr' is present at the start of the variant string or
        contig, will assume GRCh38, otherwise GRCh37.
    :return: Struct with the locus and alleles.
    :raises ValueError: If neither `variant_str` nor all of `contig`, `position`,
        `ref`, and `alt` are provided, or if the variant cannot be parsed.
    """
    if not variant_str and not all([contig, position, ref, alt]):
        raise ValueError(
            "Either `variant_str` must be provided or all of `contig`, `position`, "
            "`ref`, and `alt`."
        )

    if not build:
        # Infer the build from the contig naming: a leading 'chr' means GRCh38.
        has_chr_prefix = any(
            value and value.startswith("chr") for value in (variant_str, contig)
        )
        build = "GRCh38" if has_chr_prefix else "GRCh37"

        logger.info("No build provided. Assuming build: %s", build)

    try:
        # A dash-delimited string must split into exactly four fields; any other
        # count fails the unpacking and is reported as an invalid format below.
        if variant_str and ":" not in variant_str:
            contig, position, ref, alt = variant_str.split("-")

        # Build the colon-delimited form from the components, whether they were
        # passed in directly or obtained from the split above.
        if all([contig, position, ref, alt]):
            variant_str = f"{contig}:{position}:{ref}:{alt}"

        # NOTE(review): hl.parse_variant appears to build a lazy expression, so
        # some malformed values may only surface at evaluation time -- confirm.
        return hl.parse_variant(variant_str, reference_genome=build)

    # Catch `Exception` rather than `BaseException` so that KeyboardInterrupt and
    # SystemExit propagate instead of being reported as an invalid variant.
    except Exception as e:
        raise ValueError(
            f"Invalid variant format: {variant_str}. Valid formats: \n"
            " contig-position-ref-alt \n"
            " contig:position:ref:alt"
        ) from e
80 changes: 80 additions & 0 deletions tests/utils/test_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Tests for the parse utility module."""

from typing import Optional

import hail as hl
import pytest

from gnomad.utils.parse import parse_variant


class TestParseVariant:
    """Test the parse_variant function."""

    # Expected evaluated results for the same variant (position 1000, A>T) on
    # each build; the builds differ in contig naming ("1" vs "chr1").
    grch37_out = hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])
    grch38_out = hl.Struct(locus=hl.Locus("chr1", 1000, "GRCh38"), alleles=["A", "T"])

    @pytest.mark.parametrize(
        "variant_str, contig, position, ref, alt, build, expected",
        [
            # Dash-delimited variant strings, build inferred from the contig.
            ("1-1000-A-T", None, None, None, None, None, grch37_out),
            ("chr1-1000-A-T", None, None, None, None, None, grch38_out),
            # Individual components, build inferred from the contig.
            (None, "1", 1000, "A", "T", None, grch37_out),
            (None, "chr1", 1000, "A", "T", None, grch38_out),
            # Explicit build.
            (None, "1", 1000, "A", "T", "GRCh37", grch37_out),
            ("chr1-1000-A-T", None, None, None, None, "GRCh38", grch38_out),
            # Colon-delimited variant strings, build inferred from the contig.
            ("1:1000:A:T", None, None, None, None, None, grch37_out),
            ("chr1:1000:A:T", None, None, None, None, None, grch38_out),
        ],
    )
    def test_parse_variant(
        self,
        variant_str: Optional[str],
        contig: Optional[str],
        position: Optional[int],
        ref: Optional[str],
        alt: Optional[str],
        build: Optional[str],
        expected: hl.Struct,
    ) -> None:
        """
        Test valid parameters for the `parse_variant` function.

        :param variant_str: Variant string.
        :param contig: Chromosome of the variant.
        :param position: Variant position.
        :param ref: Reference allele.
        :param alt: Alternate allele.
        :param build: Reference genome build.
        :param expected: Expected result of evaluating the parsed variant.
        :return: None.
        """
        result = hl.eval(parse_variant(variant_str, contig, position, ref, alt, build))
        assert result == expected

    @pytest.mark.parametrize(
        "variant_str, contig, position, ref, alt, build",
        [
            # Nothing provided.
            (None, None, None, None, None, None),
            # Variant strings that do not split into exactly four fields.
            ("invalid_variant", None, None, None, None, None),
            ("1-1000-A", None, None, None, None, None),
            ("1-1000-A-T-G", None, None, None, None, None),
            # Incomplete set of components.
            (None, "1", None, "A", "T", None),
            (None, "1", 1000, "A", None, None),
        ],
    )
    def test_parse_variant_invalid(
        self,
        variant_str: Optional[str],
        contig: Optional[str],
        position: Optional[int],
        ref: Optional[str],
        alt: Optional[str],
        build: Optional[str],
    ) -> None:
        """
        Test invalid parameters for the `parse_variant` function.

        :param variant_str: Variant string.
        :param contig: Chromosome of the variant.
        :param position: Variant position.
        :param ref: Reference allele.
        :param alt: Alternate allele.
        :param build: Reference genome build.
        :return: None.
        """
        with pytest.raises(ValueError):
            parse_variant(variant_str, contig, position, ref, alt, build)

0 comments on commit 727887c

Please sign in to comment.