Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix tx_filter_variants_by_csqs to correctly handle the ignore_splicing parameter #727

Merged
merged 11 commits into from
Aug 23, 2024
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
pip install wheel
pip install -r requirements.txt
pip install -r requirements-dev.txt
curl -sSL https://broad.io/install-gcs-connector | python3 - --auth-type UNAUTHENTICATED
- name: Check formatting
run: black --check gnomad tests
- name: Check imports
Expand Down
34 changes: 22 additions & 12 deletions gnomad/utils/transcript_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,28 +323,38 @@ def tx_filter_variants_by_csqs(
lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*")
]

keep_csqs = True
if ignore_splicing:
if filter_to_csqs is not None:
filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE]
else:
filter_to_csqs = CSQ_SPLICE
keep_csqs = False

if filter_to_csqs is not None:
logger.info("Adding most severe consequence to VEP transcript consequences...")
ht = process_consequences(
ht, vep_root=vep_root, has_polyphen=include_polyphen_prioritization
logger.info("Filtering VEP consequences to exclude splice consequences...")
ht = ht.annotate(
**{
vep_root: ht[vep_root].annotate(
transcript_consequences=ht[vep_root]
.transcript_consequences.map(
lambda csq: csq.annotate(
consequence_terms=csq.consequence_terms.filter(
lambda x: ~hl.literal(CSQ_SPLICE).contains(x)
)
)
)
.filter(lambda csq: hl.len(csq.consequence_terms) > 0)
Comment on lines +330 to +339
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this code should actually be added to the VEP utils? my initial thought was to put it here, but it doesn't quite fit with that function

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also, does this mean variants that had a splicing consequence as their worst consequence were also removed from the previous pext results? I found this line in the original repo https://github.com/macarthur-lab/tx_annotation/blob/master/tx_annotation.py#L305 (but didn't dig thoroughly)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually started to add it to the vep utils, but then decided against it because that was clearly filtering based on most severe consequence, and I thought it might be confusing to have both.

Copy link
Contributor Author

@jkgoodrich jkgoodrich Aug 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fix is to make it do what was done in the original. The line of code you reference picks the most severe consequence after removing the splice consequences from the ordering

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I keep going back and forth on whether this (filtering on any csq) is helpful to have, but maybe you're right that it might be too confusing

)
}
)

logger.info("Processing VEP consequences...")
ht = process_consequences(
ht,
vep_root=vep_root,
has_polyphen=include_polyphen_prioritization,
)

return filter_vep_transcript_csqs(
ht,
vep_root=vep_root,
synonymous=False,
canonical=False,
protein_coding=filter_to_protein_coding,
csqs=filter_to_csqs,
keep_csqs=keep_csqs,
genes=filter_to_genes,
match_by_gene_symbol=match_by_gene_symbol,
additional_filtering_criteria=additional_filtering_criteria,
Expand Down
7 changes: 7 additions & 0 deletions tests/resources/test_resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
)


@pytest.fixture(autouse=True)
def reset_gnomad_public_resource_configuration():
"""Reset gnomAD public resource configuration after each test."""
yield
gnomad_public_resource_configuration.source = None


class TestTableResource:
"""Tests for TableResource."""

Expand Down
186 changes: 186 additions & 0 deletions tests/utils/test_transcript_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from gnomad.utils.transcript_annotation import (
clean_tissue_name_for_browser,
create_tx_annotation_by_region,
tx_filter_variants_by_csqs,
)


Expand Down Expand Up @@ -192,3 +193,188 @@ def test_create_tx_annotation_by_region(self, sample_hail_table: hl.Table) -> No

# Verify the result
assert result == expected_result


@pytest.fixture
def mock_vep_annotated_ht():
"""Create a mock Hail Table with VEP annotations."""
return hl.Table.parallelize(
[
{
"locus": hl.Locus("1", 861393, reference_genome="GRCh37"),
"alleles": ["G", "A"],
"vep": {
"transcript_consequences": [
{
"gene_id": "ENSG00000187634",
"gene_symbol": "SAMD11",
"transcript_id": "ENST00000342066",
"consequence_terms": [
"splice_region_variant",
"synonymous_variant",
],
"amino_acids": "V",
"biotype": "protein_coding",
"lof": None,
"lof_flags": None,
"canonical": 0,
},
{
"gene_id": "ENSG00000268179",
"gene_symbol": "AL645608.1",
"transcript_id": "ENST00000598827",
"consequence_terms": ["synonymous_variant"],
"amino_acids": "T",
"biotype": "protein_coding",
"lof": None,
"lof_flags": None,
"canonical": 1,
},
],
},
},
{
"locus": hl.Locus("1", 871274, reference_genome="GRCh37"),
"alleles": ["C", "A"],
"vep": {
"transcript_consequences": [
{
"gene_id": "ENSG00000187634",
"gene_symbol": "SAMD11",
"transcript_id": "ENST00000420190",
"consequence_terms": ["splice_region_variant"],
"amino_acids": None,
"biotype": "protein_coding",
"lof": None,
"lof_flags": None,
"canonical": 0,
}
]
},
},
{
"locus": hl.Locus("1", 871275, reference_genome="GRCh37"),
"alleles": ["C", "A"],
"vep": {
"transcript_consequences": [
{
"gene_id": "ENSG00000187634",
"gene_symbol": "SAMD11",
"transcript_id": "ENST00000420190",
"consequence_terms": [
"splice_region_variant",
"synonymous_variant",
],
"amino_acids": "A",
"biotype": "protein_coding",
"lof": None,
"lof_flags": None,
"canonical": 1,
}
]
},
},
{
"locus": hl.Locus("1", 1000, reference_genome="GRCh37"),
"alleles": ["T", "G"],
"vep": {
"transcript_consequences": [
{
"gene_id": "ENSG1",
"gene_symbol": "gene1",
"transcript_id": "ENST1",
"consequence_terms": ["stop_gained"],
"amino_acids": "Q/*",
"biotype": "protein_coding",
"lof": None,
"lof_flags": None,
"canonical": 1,
}
]
},
},
{
"locus": hl.Locus("1", 2000, reference_genome="GRCh37"),
"alleles": ["A", "T"],
"vep": {
"transcript_consequences": [
{
"gene_id": "ENSG2",
"gene_symbol": "gene2",
"transcript_id": "ENST2",
"consequence_terms": ["missense_variant"],
"amino_acids": "K/R",
"biotype": "nonsense_mediated_decay",
"lof": None,
"lof_flags": None,
"canonical": 1,
}
]
},
},
],
hl.tstruct(
locus=hl.tlocus(),
alleles=hl.tarray(hl.tstr),
vep=hl.tstruct(
transcript_consequences=hl.tarray(
hl.tstruct(
gene_id=hl.tstr,
gene_symbol=hl.tstr,
transcript_id=hl.tstr,
consequence_terms=hl.tarray(hl.tstr),
amino_acids=hl.tstr,
biotype=hl.tstr,
lof=hl.tstr,
lof_flags=hl.tstr,
canonical=hl.tint,
)
)
),
),
)


class TestTxFilterVariantsByCsqs:
"""Tests for the tx_filter_variants_by_csqs function."""

def test_filter_to_cds(self, mock_vep_annotated_ht):
"""Test filtering to CDS variants."""
result_ht = tx_filter_variants_by_csqs(
mock_vep_annotated_ht,
filter_to_cds=True,
ignore_splicing=False,
filter_to_protein_coding=False,
)
assert result_ht.count() == 2

def test_filter_to_genes(self, mock_vep_annotated_ht):
"""Test filtering to specific genes."""
result_ht = tx_filter_variants_by_csqs(
mock_vep_annotated_ht,
filter_to_genes=["ENSG1", "ENSG2"],
filter_to_cds=False,
ignore_splicing=False,
filter_to_protein_coding=False,
)
assert result_ht.count() == 2

def test_ignore_splicing(self, mock_vep_annotated_ht):
"""Test ignoring splicing variants."""
result_ht = tx_filter_variants_by_csqs(
mock_vep_annotated_ht,
filter_to_cds=False,
ignore_splicing=True,
filter_to_protein_coding=False,
)
assert result_ht.count() == 4

def test_filter_to_protein_coding(self, mock_vep_annotated_ht):
"""Test filtering to protein coding transcripts."""
result_ht = tx_filter_variants_by_csqs(
mock_vep_annotated_ht,
filter_to_cds=False,
ignore_splicing=False,
filter_to_protein_coding=True,
)
assert result_ht.count() == 4
Loading