opentargets · ireneisdoomed · Oct 3, 2024 · Oct 1, 2024 · Oct 1, 2024 · Oct 1, 2024
diff --git a/docs/python_api/datasets/l2g_features/vep.md b/docs/python_api/datasets/l2g_features/vep.md
@@ -0,0 +1,15 @@
+---
+title: From VEP
+---
+
+## List of features
+
+::: gentropy.dataset.l2g_features.vep.VepMeanFeature
+::: gentropy.dataset.l2g_features.vep.VepMeanNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.vep.VepMaximumFeature
+::: gentropy.dataset.l2g_features.vep.VepMaximumNeighbourhoodFeature
+
+## Common logic
+
+::: gentropy.dataset.l2g_features.vep.common_vep_feature_logic
+::: gentropy.dataset.l2g_features.vep.common_neighbourhood_vep_feature_logic
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -238,6 +238,7 @@ class LocusToGeneConfig(StepConfig):
     variant_index_path: str = MISSING
     colocalisation_path: str = MISSING
     study_index_path: str = MISSING
+    gene_index_path: str = MISSING
     model_path: str | None = None
     feature_matrix_path: str | None = None
     gold_standard_curation_path: str | None = None
@@ -264,6 +265,11 @@ class LocusToGeneConfig(StepConfig):
             "distanceTssMeanNeighbourhood",
             "distanceSentinelTss",
             "distanceSentinelTssNeighbourhood",
+            # vep
+            "vepMaximum",
+            "vepMaximumNeighbourhood",
+            "vepMean",
+            "vepMeanNeighbourhood",
         ]
     )
     hyperparameters: dict[str, Any] = field(

diff --git a/src/gentropy/dataset/l2g_features/vep.py b/src/gentropy/dataset/l2g_features/vep.py
@@ -0,0 +1,274 @@
+"""Collection of methods that extract VEP (functional consequence) features from the variant index dataset."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pyspark.sql.functions as f
+from pyspark.sql import Window
+
+from gentropy.common.spark_helpers import convert_from_wide_to_long
+from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
+from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
+from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.variant_index import VariantIndex
+
+if TYPE_CHECKING:
+    from pyspark.sql import DataFrame
+
+
+def common_vep_feature_logic(
+    study_loci_to_annotate: L2GGoldStandard | StudyLocus,
+    *,
+    variant_index: VariantIndex,
+    feature_name: str,
+) -> DataFrame:
+    """Extracts variant severity score computed from VEP.
+
+    Args:
+        study_loci_to_annotate (L2GGoldStandard | StudyLocus): The dataset containing study loci that will be used for annotation
+        variant_index (VariantIndex): The dataset containing functional consequence information
+        feature_name (str): The name of the feature; must contain "Maximum" or "Mean"
+
+    Returns:
+        DataFrame: Feature dataset with one row per studyLocusId/geneId
+
+    Raises:
+        TypeError: If `study_loci_to_annotate` is neither a StudyLocus nor an L2GGoldStandard
+        ValueError: If `feature_name` contains neither "Maximum" nor "Mean"
+    """
+    # Variant/Target/Severity dataframe
+    consequences_dataset = variant_index.df.withColumn(
+        "transcriptConsequence", f.explode("transcriptConsequences")
+    ).select(
+        "variantId",
+        f.col("transcriptConsequence.targetId").alias("geneId"),
+        f.col("transcriptConsequence.consequenceScore").alias("severityScore"),
+    )
+    if isinstance(study_loci_to_annotate, StudyLocus):
+        variants_df = study_loci_to_annotate.df.withColumn(
+            "variantInLocus", f.explode_outer("locus")
+        ).select(
+            "studyLocusId",
+            f.col("variantInLocus.variantId").alias("variantId"),
+            f.col("variantInLocus.posteriorProbability").alias("posteriorProbability"),
+        )
+    elif isinstance(study_loci_to_annotate, L2GGoldStandard):
+        variants_df = study_loci_to_annotate.df.select(
+            "studyLocusId", "variantId", f.lit(1.0).alias("posteriorProbability")
+        )
+    else:
+        raise TypeError("Expected a StudyLocus or L2GGoldStandard dataset")
+    variants_df = variants_df.join(consequences_dataset, "variantId")
+    if "Maximum" in feature_name:
+        agg_expr = f.max("severityScore")
+    elif "Mean" in feature_name:
+        # Scale each score by the variant posterior probability before averaging
+        agg_expr = f.mean(f.col("severityScore") * f.col("posteriorProbability"))
+    else:
+        raise ValueError(f"Unsupported VEP feature name: {feature_name}")
+    return variants_df.groupBy("studyLocusId", "geneId").agg(
+        agg_expr.alias(feature_name)
+    )
+
+
+def common_neighbourhood_vep_feature_logic(
+    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+    *,
+    variant_index: VariantIndex,
+    gene_index: GeneIndex,
+    feature_name: str,
+) -> DataFrame:
+    """Extracts the VEP severity score of a gene relative to the mean score of the protein coding genes of the same study locus.
+
+    Args:
+        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
+        variant_index (VariantIndex): The dataset containing functional consequence information
+        gene_index (GeneIndex): The dataset containing the gene biotype
+        feature_name (str): The name of the neighbourhood feature
+
+    Returns:
+        DataFrame: Feature dataset
+    """
+    # The local feature shares the name of the neighbourhood one minus the suffix
+    base_name = feature_name.replace("Neighbourhood", "")
+    # Score of every studyLocus/gene pair before any regional adjustment
+    per_gene_scores = common_vep_feature_logic(
+        study_loci_to_annotate,
+        variant_index=variant_index,
+        feature_name=base_name,
+    )
+    # Bring gene classification
+    gene_biotypes = gene_index.df.select("geneId", "biotype")
+    # Only protein coding genes contribute to the regional mean
+    protein_coding_score = f.when(
+        f.col("biotype") == "protein_coding", f.col(base_name)
+    )
+    locus_window = Window.partitionBy("studyLocusId")
+    regional_mean = f.coalesce(
+        f.mean(protein_coding_score).over(locus_window),
+        # Default to 0 if there are no protein coding genes
+        f.lit(0),
+    )
+    scores_with_biotype = per_gene_scores.join(
+        gene_biotypes,
+        "geneId",
+        "inner",
+    )
+    return (
+        scores_with_biotype.withColumn("regional_metric", regional_mean)
+        # The feature is the gene score minus the mean of its studyLocus
+        .withColumn(feature_name, f.col(base_name) - f.col("regional_metric"))
+        .drop("regional_metric", base_name, "biotype")
+    )
+
+
+class VepMaximumFeature(L2GFeature):
+    """Highest VEP consequence score observed for a studyLocus/gene across the variants of the credible set."""
+
+    feature_dependency_type = VariantIndex
+    feature_name = "vepMaximum"
+
+    @classmethod
+    def compute(
+        cls: type[VepMaximumFeature],
+        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+        feature_dependency: dict[str, Any],
+    ) -> VepMaximumFeature:
+        """Builds the feature in long format.
+
+        Args:
+            study_loci_to_annotate (StudyLocus | L2GGoldStandard): Study loci that the feature is computed for
+            feature_dependency (dict[str, Any]): Keyword arguments forwarded to the common VEP logic (it requires `variant_index`)
+
+        Returns:
+            VepMaximumFeature: Feature dataset
+        """
+        # Wide layout first: one column named after the feature
+        wide_df = common_vep_feature_logic(
+            study_loci_to_annotate=study_loci_to_annotate,
+            feature_name=cls.feature_name,
+            **feature_dependency,
+        )
+        long_df = convert_from_wide_to_long(
+            wide_df,
+            id_vars=("studyLocusId", "geneId"),
+            var_name="featureName",
+            value_name="featureValue",
+        )
+        return cls(_df=long_df, _schema=cls.get_schema())
+
+
+class VepMaximumNeighbourhoodFeature(L2GFeature):
+    """Highest VEP consequence score for a studyLocus/gene, expressed relative to the mean score of the protein coding genes in the vicinity."""
+
+    feature_dependency_type = [VariantIndex, GeneIndex]
+    feature_name = "vepMaximumNeighbourhood"
+
+    @classmethod
+    def compute(
+        cls: type[VepMaximumNeighbourhoodFeature],
+        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+        feature_dependency: dict[str, Any],
+    ) -> VepMaximumNeighbourhoodFeature:
+        """Builds the feature in long format.
+
+        Args:
+            study_loci_to_annotate (StudyLocus | L2GGoldStandard): Study loci that the feature is computed for
+            feature_dependency (dict[str, Any]): Keyword arguments forwarded to the common neighbourhood logic (it requires `variant_index` and `gene_index`)
+
+        Returns:
+            VepMaximumNeighbourhoodFeature: Feature dataset
+        """
+        # Wide layout first: one column named after the feature
+        wide_df = common_neighbourhood_vep_feature_logic(
+            study_loci_to_annotate,
+            feature_name=cls.feature_name,
+            **feature_dependency,
+        )
+        long_df = convert_from_wide_to_long(
+            wide_df,
+            id_vars=("studyLocusId", "geneId"),
+            var_name="featureName",
+            value_name="featureValue",
+        )
+        return cls(_df=long_df, _schema=cls.get_schema())
+
+
+class VepMeanFeature(L2GFeature):
+    """Mean VEP consequence score for a studyLocus/gene across the variants of the credible set.
+
+    Every severity score is multiplied by the posterior probability of its variant before averaging.
+    """
+
+    feature_dependency_type = VariantIndex
+    feature_name = "vepMean"
+
+    @classmethod
+    def compute(
+        cls: type[VepMeanFeature],
+        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+        feature_dependency: dict[str, Any],
+    ) -> VepMeanFeature:
+        """Builds the feature in long format.
+
+        Args:
+            study_loci_to_annotate (StudyLocus | L2GGoldStandard): Study loci that the feature is computed for
+            feature_dependency (dict[str, Any]): Keyword arguments forwarded to the common VEP logic (it requires `variant_index`)
+
+        Returns:
+            VepMeanFeature: Feature dataset
+        """
+        # Wide layout first: one column named after the feature
+        wide_df = common_vep_feature_logic(
+            study_loci_to_annotate=study_loci_to_annotate,
+            feature_name=cls.feature_name,
+            **feature_dependency,
+        )
+        long_df = convert_from_wide_to_long(
+            wide_df,
+            id_vars=("studyLocusId", "geneId"),
+            var_name="featureName",
+            value_name="featureValue",
+        )
+        return cls(_df=long_df, _schema=cls.get_schema())
+
+
+class VepMeanNeighbourhoodFeature(L2GFeature):
+    """Mean VEP consequence score for a studyLocus/gene, expressed relative to the mean score of the protein coding genes in the vicinity.
+
+    Every severity score is multiplied by the posterior probability of its variant before averaging.
+    """
+
+    feature_dependency_type = [VariantIndex, GeneIndex]
+    feature_name = "vepMeanNeighbourhood"
+
+    @classmethod
+    def compute(
+        cls: type[VepMeanNeighbourhoodFeature],
+        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
+        feature_dependency: dict[str, Any],
+    ) -> VepMeanNeighbourhoodFeature:
+        """Builds the feature in long format.
+
+        Args:
+            study_loci_to_annotate (StudyLocus | L2GGoldStandard): Study loci that the feature is computed for
+            feature_dependency (dict[str, Any]): Keyword arguments forwarded to the common neighbourhood logic (it requires `variant_index` and `gene_index`)
+
+        Returns:
+            VepMeanNeighbourhoodFeature: Feature dataset
+        """
+        # Wide layout first: one column named after the feature
+        wide_df = common_neighbourhood_vep_feature_logic(
+            study_loci_to_annotate,
+            feature_name=cls.feature_name,
+            **feature_dependency,
+        )
+        long_df = convert_from_wide_to_long(
+            wide_df,
+            id_vars=("studyLocusId", "geneId"),
+            var_name="featureName",
+            value_name="featureValue",
+        )
+        return cls(_df=long_df, _schema=cls.get_schema())
diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py
@@ -1,4 +1,4 @@
-"""Dataset definition for variant annotation."""
+"""Dataset definition for variant index."""
 
 from __future__ import annotations
 

diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py
@@ -12,6 +12,7 @@
 from gentropy.common.utils import access_gcp_secret
 from gentropy.config import LocusToGeneConfig
 from gentropy.dataset.colocalisation import Colocalisation
+from gentropy.dataset.gene_index import GeneIndex
 from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix
 from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
 from gentropy.dataset.l2g_prediction import L2GPrediction
@@ -41,6 +42,7 @@ def __init__(
         variant_index_path: str | None = None,
         colocalisation_path: str | None = None,
         study_index_path: str | None = None,
+        gene_index_path: str | None = None,
         gene_interactions_path: str | None = None,
         predictions_path: str | None = None,
         feature_matrix_path: str | None = None,
@@ -62,6 +64,7 @@ def __init__(
             variant_index_path (str | None): Path to the variant index dataset
             colocalisation_path (str | None): Path to the colocalisation dataset
             study_index_path (str | None): Path to the study index dataset
+            gene_index_path (str | None): Path to the gene index dataset
             gene_interactions_path (str | None): Path to the gene interactions dataset
             predictions_path (str | None): Path to the L2G predictions output dataset
             feature_matrix_path (str | None): Path to the L2G feature matrix output dataset
@@ -108,11 +111,17 @@ def __init__(
             if colocalisation_path
             else None
         )
+        self.gene_index = (
+            GeneIndex.from_parquet(session, gene_index_path, recursiveFileLookup=True)
+            if gene_index_path
+            else None
+        )
         self.features_input_loader = L2GFeatureInputLoader(
             variant_index=self.variant_index,
             coloc=self.coloc,
             studies=self.studies,
             study_locus=self.credible_set,
+            gene_index=self.gene_index,
         )
 
         if run_mode == "predict":

diff --git a/src/gentropy/method/l2g/feature_factory.py b/src/gentropy/method/l2g/feature_factory.py
@@ -33,6 +33,12 @@
     DistanceTssMeanNeighbourhoodFeature,
 )
 from gentropy.dataset.l2g_features.l2g_feature import L2GFeature
+from gentropy.dataset.l2g_features.vep import (
+    VepMaximumFeature,
+    VepMaximumNeighbourhoodFeature,
+    VepMeanFeature,
+    VepMeanNeighbourhoodFeature,
+)
 from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
 from gentropy.dataset.study_locus import StudyLocus
 
@@ -117,6 +123,10 @@ class FeatureFactory:
         "sQtlColocH4MaximumNeighbourhood": SQtlColocH4MaximumNeighbourhoodFeature,
         "tuQtlColocH4Maximum": TuQtlColocH4MaximumFeature,
         "tuQtlColocH4MaximumNeighbourhood": TuQtlColocH4MaximumNeighbourhoodFeature,
+        "vepMean": VepMeanFeature,
+        "vepMeanNeighbourhood": VepMeanNeighbourhoodFeature,
+        "vepMaximum": VepMaximumFeature,
+        "vepMaximumNeighbourhood": VepMaximumNeighbourhoodFeature,
     }
 
     def __init__(