From 19e400916e4bdb97b66c32e8c5bae3ebcb8c084f Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 19 Dec 2024 09:55:39 -0700
Subject: [PATCH 1/4] Add function `parse_variant` to create a Struct with the
 locus and alleles from a variant string or contig, position, ref, and alt.

---
 gnomad/utils/parse.py     | 63 ++++++++++++++++++++++++++++++++
 tests/utils/test_parse.py | 75 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 gnomad/utils/parse.py
 create mode 100644 tests/utils/test_parse.py

diff --git a/gnomad/utils/parse.py b/gnomad/utils/parse.py
new file mode 100644
index 000000000..1a63a3a29
--- /dev/null
+++ b/gnomad/utils/parse.py
@@ -0,0 +1,63 @@
+"""This module contains utility functions for general parsing."""
+import logging
+from typing import Optional
+
+import hail as hl
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger("parse_utils")
+logger.setLevel(logging.INFO)
+
+
+def parse_variant(
+    variant_str: Optional[str] = None,
+    contig: Optional[str] = None,
+    position: Optional[int] = None,
+    ref: Optional[str] = None,
+    alt: Optional[str] = None,
+    build: Optional[str] = None,
+) -> hl.expr.StructExpression:
+    """
+    Create a Struct with the locus and alleles from a variant string or contig, position, ref, and alt.
+
+    :param variant_str: Variant string in the format contig-position-ref-alt or
+        contig:position:ref:alt.
+    :param contig: Chromosome of the variant.
+    :param position: Variant position.
+    :param ref: Reference allele.
+    :param alt: Alternate allele.
+    :param build: Reference genome build. If not provided, will infer from the variant
+        string or contig. If 'chr' is present in the contig, will assume GRCh38,
+        otherwise GRCh37.
+    :return: Struct with the locus and alleles.
+    """
+    if not variant_str and not all([contig, position, ref, alt]):
+        raise ValueError(
+            "Either `variant_str` must be provided or all of `contig`, `position`, "
+            "`ref`, and `alt`."
+        )
+
+    if not build:
+        build = "GRCh37"
+        if (variant_str and variant_str.startswith("chr")) or (contig and contig.startswith("chr")):
+            build = "GRCh38"
+
+        logger.info("No build provided. Assuming build: %s", build)
+
+    try:
+        if variant_str and ":" not in variant_str:
+            contig, position, ref, alt = variant_str.split("-")
+        if all([contig, position, ref, alt]):
+            variant_str = f"{contig}:{position}:{ref}:{alt}"
+
+        return hl.parse_variant(variant_str, reference_genome=build)
+
+    except:
+        raise ValueError(
+            f"Invalid variant format: {variant_str}. Valid formats: \n"
+            f" contig-position-ref-alt \n"
+            f" contig:position:ref:alt"
+        )
diff --git a/tests/utils/test_parse.py b/tests/utils/test_parse.py
new file mode 100644
index 000000000..56c4311e9
--- /dev/null
+++ b/tests/utils/test_parse.py
@@ -0,0 +1,75 @@
+"""Tests for the parse utility module."""
+
+import pytest
+import hail as hl
+from gnomad.utils.parse import parse_variant
+
+class TestParseVariant:
+    """Test the parse_variant function."""
+
+    @pytest.mark.parametrize(
+        "variant_str, contig, position, ref, alt, build, expected",
+        [
+            ("1-1000-A-T", None, None, None, None, None, hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
+            ("chr1-1000-A-T", None, None, None, None, None, hl.Struct(locus=hl.Locus("chr1", 1000, "GRCh38"), alleles=["A", "T"])),
+            (None, "1", 1000, "A", "T", None, hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
+            (None, "chr1", 1000, "A", "T", None, hl.Struct(locus=hl.Locus("chr1", 1000, "GRCh38"), alleles=["A", "T"])),
+            (None, "1", 1000, "A", "T", "GRCh37", hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
+            ("1:1000:A:T", None, None, None, None, None, hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
+        ],
+    )
+    def test_parse_variant(
+        self,
+        variant_str: str,
+        contig: str,
+        position: int,
+        ref: str,
+        alt: str,
+        build: str,
+        expected: hl.expr.StructExpression,
+    ) -> None:
+        """
+        Test valid parameters for the `parse_variant` function.
+
+        :param variant_str: Variant string.
+        :param contig: Chromosome of the variant.
+        :param position: Variant position.
+        :param ref: Reference allele.
+        :param alt: Alternate allele.
+        :param build: Reference genome build.
+        :param expected: Expected result.
+        :return: None.
+        """
+        result = hl.eval(parse_variant(variant_str, contig, position, ref, alt, build))
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "variant_str, contig, position, ref, alt, build",
+        [
+            (None, None, None, None, None, None),
+            ("invalid_variant", None, None, None, None, None),
+            (None, "1", None, "A", "T", None),
+        ],
+    )
+    def test_parse_variant_invalid(
+        self,
+        variant_str: str,
+        contig: str,
+        position: int,
+        ref: str,
+        alt: str,
+        build: str,
+    ) -> None:
+        """
+        Test invalid parameters for the `parse_variant` function.
+
+        :param variant_str: Variant string.
+        :param contig: Chromosome of the variant.
+        :param position: Variant position.
+        :param ref: Reference allele.
+        :param alt: Alternate allele.
+        :param build: Reference genome build.
+        :return: None.
+        """
+        with pytest.raises(ValueError):
+            parse_variant(variant_str, contig, position, ref, alt, build)
From e2154d0286a6ac3b35c36721c806e71d886b3c00 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:06:02 -0700
Subject: [PATCH 2/4] Format

---
 gnomad/utils/parse.py     |  5 ++++-
 tests/utils/test_parse.py | 16 ++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/gnomad/utils/parse.py b/gnomad/utils/parse.py
index 1a63a3a29..6393f01ce 100644
--- a/gnomad/utils/parse.py
+++ b/gnomad/utils/parse.py
@@ -1,4 +1,5 @@
 """This module contains utility functions for general parsing."""
+
 import logging
 from typing import Optional
 
@@ -42,7 +43,9 @@ def parse_variant(
 
     if not build:
         build = "GRCh37"
-        if (variant_str and variant_str.startswith("chr")) or (contig and contig.startswith("chr")):
+        if (variant_str and variant_str.startswith("chr")) or (
+            contig and contig.startswith("chr")
+        ):
             build = "GRCh38"
 
         logger.info("No build provided. Assuming build: %s", build)
diff --git a/tests/utils/test_parse.py b/tests/utils/test_parse.py
index 56c4311e9..9c255ed11 100644
--- a/tests/utils/test_parse.py
+++ b/tests/utils/test_parse.py
@@ -4,18 +4,22 @@
 import hail as hl
 from gnomad.utils.parse import parse_variant
 
+
 class TestParseVariant:
     """Test the parse_variant function."""
 
+    grch37_out = hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])
+    grch38_out = hl.Struct(locus=hl.Locus("chr1", 1000, "GRCh38"), alleles=["A", "T"])
+
     @pytest.mark.parametrize(
         "variant_str, contig, position, ref, alt, build, expected",
         [
-            ("1-1000-A-T", None, None, None, None, None, hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
-            ("chr1-1000-A-T", None, None, None, None, None, hl.Struct(locus=hl.Locus("chr1", 1000, "GRCh38"), alleles=["A", "T"])),
-            (None, "1", 1000, "A", "T", None, hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
-            (None, "chr1", 1000, "A", "T", None, hl.Struct(locus=hl.Locus("chr1", 1000, "GRCh38"), alleles=["A", "T"])),
-            (None, "1", 1000, "A", "T", "GRCh37", hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
-            ("1:1000:A:T", None, None, None, None, None, hl.Struct(locus=hl.Locus("1", 1000, "GRCh37"), alleles=["A", "T"])),
+            ("1-1000-A-T", None, None, None, None, None, grch37_out),
+            ("chr1-1000-A-T", None, None, None, None, None, grch38_out),
+            (None, "1", 1000, "A", "T", None, grch37_out),
+            (None, "chr1", 1000, "A", "T", None, grch38_out),
+            (None, "1", 1000, "A", "T", "GRCh37", grch37_out),
+            ("1:1000:A:T", None, None, None, None, None, grch37_out),
         ],
     )
     def test_parse_variant(
From 1a4c8b38541298ceb6068898b7fe173428a91361 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:18:31 -0700
Subject: [PATCH 3/4] isort

---
 tests/utils/test_parse.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/utils/test_parse.py b/tests/utils/test_parse.py
index 9c255ed11..710429a29 100644
--- a/tests/utils/test_parse.py
+++ b/tests/utils/test_parse.py
@@ -1,7 +1,8 @@
 """Tests for the parse utility module."""
 
-import pytest
 import hail as hl
+import pytest
+
 from gnomad.utils.parse import parse_variant
 
 
From 2e8b64311a4af3af6edd00096c1faf4948b5280a Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 19 Dec 2024 11:51:32 -0700
Subject: [PATCH 4/4] Catch Exception instead of a bare except

---
Reviewer note: this patch originally replaced the bare `except:` with
`except BaseException:`, which is behaviourally the same handler, so
KeyboardInterrupt and SystemExit raised inside the `try` body were still
rewritten into "Invalid variant format" ValueErrors. The handler now catches
`Exception` only. The ValueError raised by the failed `split("-")` unpacking
is still converted, and the original error stays attached as the implicit
exception context. The blob ids on the `index` line below predate this
amendment.

 gnomad/utils/parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/utils/parse.py b/gnomad/utils/parse.py
index 6393f01ce..1d65f51d7 100644
--- a/gnomad/utils/parse.py
+++ b/gnomad/utils/parse.py
@@ -58,7 +58,7 @@ def parse_variant(
 
         return hl.parse_variant(variant_str, reference_genome=build)
 
-    except:
+    except Exception:
         raise ValueError(
             f"Invalid variant format: {variant_str}. Valid formats: \n"
             f" contig-position-ref-alt \n"