Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(l2g): implement variant consequence features from VEP #805

Merged
merged 18 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
8fa0d72
feat(variant_index): set variant functional consequence to pathogenic…
ireneisdoomed Oct 1, 2024
5742436
feat: add and test vep features
ireneisdoomed Oct 1, 2024
4e8903a
feat: add and test vep neighbourhood features
ireneisdoomed Oct 1, 2024
8b233e4
feat(vep_features): use only protein coding genes to look in the vici…
ireneisdoomed Oct 1, 2024
46be5de
chore: update feature list and other bits
ireneisdoomed Oct 1, 2024
70e9adc
fix: docs ref
ireneisdoomed Oct 1, 2024
6919997
Merge branch 'dev' of https://github.com/opentargets/gentropy into il…
ireneisdoomed Oct 1, 2024
418c47b
refactor: set variant functional consequence to pathogenicity score a…
ireneisdoomed Oct 2, 2024
cdbc361
chore: drop `VariantIndex.get_most_severe_gene_consequence`
ireneisdoomed Oct 2, 2024
62c13f2
chore(VariantIndex): make `CONSEQUENCE_TO_PATHOGENICITY_SCORE` a clas…
ireneisdoomed Oct 3, 2024
4ccd5d0
fix(vep): convert `id_to_score_map` to `label_to_score_map`
ireneisdoomed Oct 3, 2024
f892d23
Merge branch 'dev' into il-csq_to_score_property
ireneisdoomed Oct 3, 2024
320f430
chore: remove comment
ireneisdoomed Oct 3, 2024
24e476a
Merge branch 'il-csq_to_score_property' of https://github.com/opentar…
ireneisdoomed Oct 3, 2024
604e8ba
Merge branch 'dev' of https://github.com/opentargets/gentropy into il…
ireneisdoomed Oct 3, 2024
7344ef0
Merge branch 'il-csq_to_score_property' of https://github.com/opentar…
ireneisdoomed Oct 3, 2024
20e74d8
chore: access max consequence score from variant index
ireneisdoomed Oct 3, 2024
48e6a30
Merge branch 'dev' of https://github.com/opentargets/gentropy into il…
ireneisdoomed Oct 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/python_api/datasets/l2g_features/vep.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
title: From VEP
---

## List of features

::: gentropy.dataset.l2g_features.vep.VepMeanFeature
::: gentropy.dataset.l2g_features.vep.VepMeanNeighbourhoodFeature
::: gentropy.dataset.l2g_features.vep.VepMaximumFeature
::: gentropy.dataset.l2g_features.vep.VepMaximumNeighbourhoodFeature

## Common logic

::: gentropy.dataset.l2g_features.vep.common_vep_feature_logic
::: gentropy.dataset.l2g_features.vep.common_neighbourhood_vep_feature_logic
6 changes: 6 additions & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ class LocusToGeneConfig(StepConfig):
variant_index_path: str = MISSING
colocalisation_path: str = MISSING
study_index_path: str = MISSING
gene_index_path: str = MISSING
model_path: str | None = None
feature_matrix_path: str | None = None
gold_standard_curation_path: str | None = None
Expand All @@ -264,6 +265,11 @@ class LocusToGeneConfig(StepConfig):
"distanceTssMeanNeighbourhood",
"distanceSentinelTss",
"distanceSentinelTssNeighbourhood",
# vep
"vepMaximum",
"vepMaximumNeighbourhood",
"vepMean",
"vepMeanNeighbourhood",
]
)
hyperparameters: dict[str, Any] = field(
Expand Down
274 changes: 274 additions & 0 deletions src/gentropy/dataset/l2g_features/vep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
"""Collection of methods that extract variant consequence (VEP) features from the variant index dataset."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pyspark.sql.functions as f
from pyspark.sql import Window

from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.variant_index import VariantIndex

if TYPE_CHECKING:
from pyspark.sql import DataFrame


def common_vep_feature_logic(
    study_loci_to_annotate: L2GGoldStandard | StudyLocus,
    *,
    variant_index: VariantIndex,
    feature_name: str,
) -> DataFrame:
    """Extracts variant severity score computed from VEP.

    Each transcript consequence of the variant index is flattened into a
    variant/gene/score row, joined to the variants of every study locus and then
    aggregated per `studyLocusId`/`geneId`. The aggregation is picked from the
    feature name: the maximum score when it contains "Maximum", otherwise the mean
    of `severityScore * posteriorProbability` when it contains "Mean".

    Args:
        study_loci_to_annotate (L2GGoldStandard | StudyLocus): The dataset containing study loci that will be used for annotation
        variant_index (VariantIndex): The dataset containing functional consequence information
        feature_name (str): The name of the feature

    Returns:
        DataFrame: Feature dataset with one row per `studyLocusId`/`geneId` and the score in a column named after `feature_name`

    Raises:
        TypeError: If `study_loci_to_annotate` is neither a StudyLocus nor an L2GGoldStandard
        ValueError: If `feature_name` contains neither "Maximum" nor "Mean"
    """
    # Fail early with an explicit message instead of an UnboundLocalError further down
    if "Maximum" not in feature_name and "Mean" not in feature_name:
        raise ValueError(
            f"Unsupported VEP feature name: {feature_name!r}. "
            "It must contain either 'Maximum' or 'Mean'."
        )

    # Variant/Target/Severity dataframe
    consequences_dataset = variant_index.df.withColumn(
        "transcriptConsequence", f.explode("transcriptConsequences")
    ).select(
        "variantId",
        f.col("transcriptConsequence.targetId").alias("geneId"),
        f.col("transcriptConsequence.consequenceScore").alias("severityScore"),
    )

    if isinstance(study_loci_to_annotate, StudyLocus):
        # One row per variant of the credible set, carrying its posterior probability.
        # The join below is an inner join, so variants without any transcript
        # consequence do not contribute to the feature.
        variants_df = (
            study_loci_to_annotate.df.withColumn(
                "variantInLocus", f.explode_outer("locus")
            )
            .select(
                "studyLocusId",
                f.col("variantInLocus.variantId").alias("variantId"),
                f.col("variantInLocus.posteriorProbability").alias(
                    "posteriorProbability"
                ),
            )
            .join(consequences_dataset, "variantId")
        )
    elif isinstance(study_loci_to_annotate, L2GGoldStandard):
        # NOTE(review): asked in the PR why L2GGoldStandard is still supported
        # ("Isn't it going to be removed?"). Author's answer: as agreed offline,
        # it will be removed once opentargets/issues#3525 is tackled.
        # Gold standards hold a single variant per locus, given a weight of 1.0.
        variants_df = study_loci_to_annotate.df.select(
            "studyLocusId", "variantId", f.lit(1.0).alias("posteriorProbability")
        ).join(consequences_dataset, "variantId")
    else:
        raise TypeError(
            "study_loci_to_annotate must be a StudyLocus or an L2GGoldStandard, "
            f"got {type(study_loci_to_annotate).__name__}."
        )

    # "Maximum" takes precedence over "Mean", as in the original if/elif chain
    if "Maximum" in feature_name:
        agg_expr = f.max("severityScore")
    else:
        variants_df = variants_df.withColumn(
            "weightedScore", f.col("severityScore") * f.col("posteriorProbability")
        )
        agg_expr = f.mean("weightedScore")

    return variants_df.groupBy("studyLocusId", "geneId").agg(
        agg_expr.alias(feature_name)
    )


def common_neighbourhood_vep_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    variant_index: VariantIndex,
    gene_index: GeneIndex,
    feature_name: str,
) -> DataFrame:
    """Extracts the VEP severity score of a gene relative to the protein coding genes of the same locus.

    The local (non-neighbourhood) VEP feature is computed first. Then, within each
    `studyLocusId`, the mean of that local feature across protein coding genes is
    subtracted from the local value of every gene.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        variant_index (VariantIndex): The dataset containing functional consequence information
        gene_index (GeneIndex): The dataset containing the gene biotype
        feature_name (str): The name of the feature; it must contain "Neighbourhood"

    Returns:
        DataFrame: Feature dataset with one row per `studyLocusId`/`geneId` and the score in a column named after `feature_name`

    Raises:
        ValueError: If `feature_name` does not contain "Neighbourhood"
    """
    # Without the suffix the local and the output column would share a name, and the
    # final drop would silently remove the feature column from the result.
    if "Neighbourhood" not in feature_name:
        raise ValueError(
            f"Unsupported neighbourhood feature name: {feature_name!r}. "
            "It must contain 'Neighbourhood'."
        )
    local_feature_name = feature_name.replace("Neighbourhood", "")

    # First compute the VEP score of each studyLocus/gene pair
    local_metric = common_vep_feature_logic(
        study_loci_to_annotate,
        feature_name=local_feature_name,
        variant_index=variant_index,
    )

    # NOTE(review): asked in the PR how genes are assigned to a studyLocusId.
    # Author's answer: genes come from VEP, i.e. the genes of the transcripts that
    # the variant overlaps with in a 500kb window.
    # Follow-up question: with many variantIds in one studyLocusId, is the union
    # across all variantIds used, and is a variant not assigned to a gene scored 0?
    # From the local feature logic: rows are grouped by studyLocusId/geneId after an
    # inner join with the transcript consequences, so the genes of a locus are the
    # union over its variants, and variant/gene pairs without a consequence simply
    # contribute no row (they are not counted as 0).

    # Only protein coding genes contribute to the regional mean; other biotypes
    # evaluate to null and are ignored by the aggregation.
    protein_coding_score = f.when(
        f.col("biotype") == "protein_coding", f.col(local_feature_name)
    )
    regional_metric = f.coalesce(
        f.mean(protein_coding_score).over(Window.partitionBy("studyLocusId")),
        # Default to 0 if there are no protein coding genes
        f.lit(0),
    )

    return (
        # Bring gene classification; genes missing from the gene index are dropped
        local_metric.join(
            gene_index.df.select("geneId", "biotype"),
            "geneId",
            "inner",
        )
        # The regional value is the same for every gene of a given studyLocus
        .withColumn("regional_metric", regional_metric)
        .withColumn(feature_name, f.col(local_feature_name) - f.col("regional_metric"))
        .drop("regional_metric", local_feature_name, "biotype")
    )


class VepMaximumFeature(L2GFeature):
    """Highest functional consequence score observed for a studyLocus/gene pair across the variants of the credible set."""

    feature_dependency_type = VariantIndex
    feature_name = "vepMaximum"

    @classmethod
    def compute(
        cls: type[VepMaximumFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMaximumFeature:
        """Builds the feature in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic`, i.e. the dataset with the functional consequence information

        Returns:
            VepMaximumFeature: Feature dataset
        """
        # Wide layout: one column named after the feature per studyLocus/gene
        wide_df = common_vep_feature_logic(
            study_loci_to_annotate=study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Long layout: featureName/featureValue pairs keyed by studyLocus/gene
        long_df = convert_from_wide_to_long(
            wide_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_df, _schema=cls.get_schema())


class VepMaximumNeighbourhoodFeature(L2GFeature):
    """Highest functional consequence score for a studyLocus/gene pair, expressed relative to the mean VEP score of the protein coding genes in the vicinity."""

    feature_dependency_type = [VariantIndex, GeneIndex]
    feature_name = "vepMaximumNeighbourhood"

    @classmethod
    def compute(
        cls: type[VepMaximumNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMaximumNeighbourhoodFeature:
        """Builds the feature in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic`, i.e. the datasets with the functional consequence information and the gene biotypes

        Returns:
            VepMaximumNeighbourhoodFeature: Feature dataset
        """
        # Wide layout: one column named after the feature per studyLocus/gene
        wide_df = common_neighbourhood_vep_feature_logic(
            study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Long layout: featureName/featureValue pairs keyed by studyLocus/gene
        long_df = convert_from_wide_to_long(
            wide_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_df, _schema=cls.get_schema())


class VepMeanFeature(L2GFeature):
    """Mean functional consequence score for a studyLocus/gene pair across the variants of the credible set.

    Each variant's severity score is multiplied by its posterior probability before averaging.
    """

    feature_dependency_type = VariantIndex
    feature_name = "vepMean"

    @classmethod
    def compute(
        cls: type[VepMeanFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMeanFeature:
        """Builds the feature in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic`, i.e. the dataset with the functional consequence information

        Returns:
            VepMeanFeature: Feature dataset
        """
        # Wide layout: one column named after the feature per studyLocus/gene
        wide_df = common_vep_feature_logic(
            study_loci_to_annotate=study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Long layout: featureName/featureValue pairs keyed by studyLocus/gene
        long_df = convert_from_wide_to_long(
            wide_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_df, _schema=cls.get_schema())


class VepMeanNeighbourhoodFeature(L2GFeature):
    """Mean functional consequence score for a studyLocus/gene pair, expressed relative to the mean VEP score of the protein coding genes in the vicinity.

    Each variant's severity score is multiplied by its posterior probability before averaging.
    """

    feature_dependency_type = [VariantIndex, GeneIndex]
    feature_name = "vepMeanNeighbourhood"

    @classmethod
    def compute(
        cls: type[VepMeanNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMeanNeighbourhoodFeature:
        """Builds the feature in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic`, i.e. the datasets with the functional consequence information and the gene biotypes

        Returns:
            VepMeanNeighbourhoodFeature: Feature dataset
        """
        # Wide layout: one column named after the feature per studyLocus/gene
        wide_df = common_neighbourhood_vep_feature_logic(
            study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Long layout: featureName/featureValue pairs keyed by studyLocus/gene
        long_df = convert_from_wide_to_long(
            wide_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_df, _schema=cls.get_schema())
2 changes: 1 addition & 1 deletion src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Dataset definition for variant annotation."""
"""Dataset definition for variant index."""

from __future__ import annotations

Expand Down
9 changes: 9 additions & 0 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from gentropy.common.utils import access_gcp_secret
from gentropy.config import LocusToGeneConfig
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
from gentropy.dataset.l2g_prediction import L2GPrediction
Expand Down Expand Up @@ -41,6 +42,7 @@ def __init__(
variant_index_path: str | None = None,
colocalisation_path: str | None = None,
study_index_path: str | None = None,
gene_index_path: str | None = None,
gene_interactions_path: str | None = None,
predictions_path: str | None = None,
feature_matrix_path: str | None = None,
Expand All @@ -62,6 +64,7 @@ def __init__(
variant_index_path (str | None): Path to the variant index dataset
colocalisation_path (str | None): Path to the colocalisation dataset
study_index_path (str | None): Path to the study index dataset
gene_index_path (str | None): Path to the gene index dataset
gene_interactions_path (str | None): Path to the gene interactions dataset
predictions_path (str | None): Path to the L2G predictions output dataset
feature_matrix_path (str | None): Path to the L2G feature matrix output dataset
Expand Down Expand Up @@ -108,11 +111,17 @@ def __init__(
if colocalisation_path
else None
)
self.gene_index = (
GeneIndex.from_parquet(session, gene_index_path, recursiveFileLookup=True)
if gene_index_path
else None
)
self.features_input_loader = L2GFeatureInputLoader(
variant_index=self.variant_index,
coloc=self.coloc,
studies=self.studies,
study_locus=self.credible_set,
gene_index=self.gene_index,
)

if run_mode == "predict":
Expand Down
10 changes: 10 additions & 0 deletions src/gentropy/method/l2g/feature_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@
DistanceTssMeanNeighbourhoodFeature,
)
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
from gentropy.dataset.l2g_features.vep import (
VepMaximumFeature,
VepMaximumNeighbourhoodFeature,
VepMeanFeature,
VepMeanNeighbourhoodFeature,
)
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
from gentropy.dataset.study_locus import StudyLocus

Expand Down Expand Up @@ -117,6 +123,10 @@ class FeatureFactory:
"sQtlColocH4MaximumNeighbourhood": SQtlColocH4MaximumNeighbourhoodFeature,
"tuQtlColocH4Maximum": TuQtlColocH4MaximumFeature,
"tuQtlColocH4MaximumNeighbourhood": TuQtlColocH4MaximumNeighbourhoodFeature,
"vepMean": VepMeanFeature,
"vepMeanNeighbourhood": VepMeanNeighbourhoodFeature,
"vepMaximum": VepMaximumFeature,
"vepMaximumNeighbourhood": VepMaximumNeighbourhoodFeature,
}

def __init__(
Expand Down
Loading
Loading