Summary of this patch:

* .github/workflows/ci.yml: after the pip installs, pipe the script fetched
  from https://broad.io/install-gcs-connector into python3 with
  --auth-type UNAUTHENTICATED.
* gnomad/utils/transcript_annotation.py (tx_filter_variants_by_csqs): when
  ignore_splicing is set, remove CSQ_SPLICE terms from every transcript
  consequence and discard consequences left with no terms, instead of
  rewriting filter_to_csqs / keep_csqs. keep_csqs is no longer passed to
  filter_vep_transcript_csqs.
* tests/resources/test_resource_utils.py: add an autouse fixture that sets
  gnomad_public_resource_configuration.source back to None after each test.
* tests/utils/test_transcript_annotation.py: add a five-row mock VEP table
  fixture and row-count tests for tx_filter_variants_by_csqs.

NOTE(review): the line breaks and indentation below were reconstructed from
a whitespace-collapsed copy. In particular, process_consequences is assumed
to run unconditionally (outside "if ignore_splicing:"), and the leading
whitespace of context lines is assumed -- confirm against the original patch.

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6ecd8fdfe..3203d93a3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,6 +27,7 @@ jobs:
           pip install wheel
           pip install -r requirements.txt
           pip install -r requirements-dev.txt
+          curl -sSL https://broad.io/install-gcs-connector | python3 - --auth-type UNAUTHENTICATED
       - name: Check formatting
         run: black --check gnomad tests
       - name: Check imports
diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py
index 3e3da1e92..28a02fe51 100644
--- a/gnomad/utils/transcript_annotation.py
+++ b/gnomad/utils/transcript_annotation.py
@@ -323,20 +323,31 @@ def tx_filter_variants_by_csqs(
             lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*")
         ]
 
-    keep_csqs = True
     if ignore_splicing:
-        if filter_to_csqs is not None:
-            filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE]
-        else:
-            filter_to_csqs = CSQ_SPLICE
-            keep_csqs = False
-
-    if filter_to_csqs is not None:
-        logger.info("Adding most severe consequence to VEP transcript consequences...")
-        ht = process_consequences(
-            ht, vep_root=vep_root, has_polyphen=include_polyphen_prioritization
+        logger.info("Filtering VEP consequences to exclude splice consequences...")
+        ht = ht.annotate(
+            **{
+                vep_root: ht[vep_root].annotate(
+                    transcript_consequences=ht[vep_root]
+                    .transcript_consequences.map(
+                        lambda csq: csq.annotate(
+                            consequence_terms=csq.consequence_terms.filter(
+                                lambda x: ~hl.literal(CSQ_SPLICE).contains(x)
+                            )
+                        )
+                    )
+                    .filter(lambda csq: hl.len(csq.consequence_terms) > 0)
+                )
+            }
         )
 
+    logger.info("Processing VEP consequences...")
+    ht = process_consequences(
+        ht,
+        vep_root=vep_root,
+        has_polyphen=include_polyphen_prioritization,
+    )
+
     return filter_vep_transcript_csqs(
         ht,
         vep_root=vep_root,
@@ -344,7 +355,6 @@ def tx_filter_variants_by_csqs(
         canonical=False,
         protein_coding=filter_to_protein_coding,
         csqs=filter_to_csqs,
-        keep_csqs=keep_csqs,
         genes=filter_to_genes,
         match_by_gene_symbol=match_by_gene_symbol,
         additional_filtering_criteria=additional_filtering_criteria,
diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py
index fac02b9f3..4d389a9e7 100644
--- a/tests/resources/test_resource_utils.py
+++ b/tests/resources/test_resource_utils.py
@@ -14,6 +14,13 @@
 )
 
 
+@pytest.fixture(autouse=True)
+def reset_gnomad_public_resource_configuration():
+    """Reset gnomad_public_resource_configuration.source to None after each test."""
+    yield
+    gnomad_public_resource_configuration.source = None
+
+
 class TestTableResource:
     """Tests for TableResource."""
 
diff --git a/tests/utils/test_transcript_annotation.py b/tests/utils/test_transcript_annotation.py
index 51661aa45..7ca204a04 100644
--- a/tests/utils/test_transcript_annotation.py
+++ b/tests/utils/test_transcript_annotation.py
@@ -6,6 +6,7 @@
 from gnomad.utils.transcript_annotation import (
     clean_tissue_name_for_browser,
     create_tx_annotation_by_region,
+    tx_filter_variants_by_csqs,
 )
 
 
@@ -192,3 +193,188 @@ def test_create_tx_annotation_by_region(self, sample_hail_table: hl.Table) -> No
 
         # Verify the result
         assert result == expected_result
+
+
+@pytest.fixture
+def mock_vep_annotated_ht():
+    """Create a five-row mock Hail Table of GRCh37 variants with VEP annotations."""
+    return hl.Table.parallelize(
+        [
+            {
+                "locus": hl.Locus("1", 861393, reference_genome="GRCh37"),
+                "alleles": ["G", "A"],
+                "vep": {
+                    "transcript_consequences": [
+                        {
+                            "gene_id": "ENSG00000187634",
+                            "gene_symbol": "SAMD11",
+                            "transcript_id": "ENST00000342066",
+                            "consequence_terms": [
+                                "splice_region_variant",
+                                "synonymous_variant",
+                            ],
+                            "amino_acids": "V",
+                            "biotype": "protein_coding",
+                            "lof": None,
+                            "lof_flags": None,
+                            "canonical": 0,
+                        },
+                        {
+                            "gene_id": "ENSG00000268179",
+                            "gene_symbol": "AL645608.1",
+                            "transcript_id": "ENST00000598827",
+                            "consequence_terms": ["synonymous_variant"],
+                            "amino_acids": "T",
+                            "biotype": "protein_coding",
+                            "lof": None,
+                            "lof_flags": None,
+                            "canonical": 1,
+                        },
+                    ],
+                },
+            },
+            {
+                "locus": hl.Locus("1", 871274, reference_genome="GRCh37"),
+                "alleles": ["C", "A"],
+                "vep": {
+                    "transcript_consequences": [
+                        {
+                            "gene_id": "ENSG00000187634",
+                            "gene_symbol": "SAMD11",
+                            "transcript_id": "ENST00000420190",
+                            "consequence_terms": ["splice_region_variant"],
+                            "amino_acids": None,
+                            "biotype": "protein_coding",
+                            "lof": None,
+                            "lof_flags": None,
+                            "canonical": 0,
+                        }
+                    ]
+                },
+            },
+            {
+                "locus": hl.Locus("1", 871275, reference_genome="GRCh37"),
+                "alleles": ["C", "A"],
+                "vep": {
+                    "transcript_consequences": [
+                        {
+                            "gene_id": "ENSG00000187634",
+                            "gene_symbol": "SAMD11",
+                            "transcript_id": "ENST00000420190",
+                            "consequence_terms": [
+                                "splice_region_variant",
+                                "synonymous_variant",
+                            ],
+                            "amino_acids": "A",
+                            "biotype": "protein_coding",
+                            "lof": None,
+                            "lof_flags": None,
+                            "canonical": 1,
+                        }
+                    ]
+                },
+            },
+            {
+                "locus": hl.Locus("1", 1000, reference_genome="GRCh37"),
+                "alleles": ["T", "G"],
+                "vep": {
+                    "transcript_consequences": [
+                        {
+                            "gene_id": "ENSG1",
+                            "gene_symbol": "gene1",
+                            "transcript_id": "ENST1",
+                            "consequence_terms": ["stop_gained"],
+                            "amino_acids": "Q/*",
+                            "biotype": "protein_coding",
+                            "lof": None,
+                            "lof_flags": None,
+                            "canonical": 1,
+                        }
+                    ]
+                },
+            },
+            {
+                "locus": hl.Locus("1", 2000, reference_genome="GRCh37"),
+                "alleles": ["A", "T"],
+                "vep": {
+                    "transcript_consequences": [
+                        {
+                            "gene_id": "ENSG2",
+                            "gene_symbol": "gene2",
+                            "transcript_id": "ENST2",
+                            "consequence_terms": ["missense_variant"],
+                            "amino_acids": "K/R",
+                            "biotype": "nonsense_mediated_decay",
+                            "lof": None,
+                            "lof_flags": None,
+                            "canonical": 1,
+                        }
+                    ]
+                },
+            },
+        ],
+        hl.tstruct(
+            locus=hl.tlocus(),
+            alleles=hl.tarray(hl.tstr),
+            vep=hl.tstruct(
+                transcript_consequences=hl.tarray(
+                    hl.tstruct(
+                        gene_id=hl.tstr,
+                        gene_symbol=hl.tstr,
+                        transcript_id=hl.tstr,
+                        consequence_terms=hl.tarray(hl.tstr),
+                        amino_acids=hl.tstr,
+                        biotype=hl.tstr,
+                        lof=hl.tstr,
+                        lof_flags=hl.tstr,
+                        canonical=hl.tint,
+                    )
+                )
+            ),
+        ),
+    )
+
+
+class TestTxFilterVariantsByCsqs:
+    """Tests for the tx_filter_variants_by_csqs function."""
+
+    def test_filter_to_cds(self, mock_vep_annotated_ht):
+        """Test that filtering to CDS keeps two of the five mock variants."""
+        result_ht = tx_filter_variants_by_csqs(
+            mock_vep_annotated_ht,
+            filter_to_cds=True,
+            ignore_splicing=False,
+            filter_to_protein_coding=False,
+        )
+        assert result_ht.count() == 2
+
+    def test_filter_to_genes(self, mock_vep_annotated_ht):
+        """Test that filtering to genes ENSG1 and ENSG2 keeps their two variants."""
+        result_ht = tx_filter_variants_by_csqs(
+            mock_vep_annotated_ht,
+            filter_to_genes=["ENSG1", "ENSG2"],
+            filter_to_cds=False,
+            ignore_splicing=False,
+            filter_to_protein_coding=False,
+        )
+        assert result_ht.count() == 2
+
+    def test_ignore_splicing(self, mock_vep_annotated_ht):
+        """Test that ignoring splicing keeps four of the five mock variants."""
+        result_ht = tx_filter_variants_by_csqs(
+            mock_vep_annotated_ht,
+            filter_to_cds=False,
+            ignore_splicing=True,
+            filter_to_protein_coding=False,
+        )
+        assert result_ht.count() == 4
+
+    def test_filter_to_protein_coding(self, mock_vep_annotated_ht):
+        """Test that filtering to protein coding keeps four of the five variants."""
+        result_ht = tx_filter_variants_by_csqs(
+            mock_vep_annotated_ht,
+            filter_to_cds=False,
+            ignore_splicing=False,
+            filter_to_protein_coding=True,
+        )
+        assert result_ht.count() == 4