Skip to content

Commit

Permalink
feat(variant index): variant description to summarise variant consequ…
Browse files Browse the repository at this point in the history
…ences in transcripts (#914)

* feat: extending the VEP schema

* feat(vep parser): adding logic to build variant description based on VEP annotation

* fix: remove commented lines

* fix: improving consequence to so term mapping

* fix: nullified variant descriptions

* fix: assessment_flag_column_name type fix

* chore: pre-commit auto fixes [...]

* feat: adding formatting to distances in description

* fix: formatting

* fix: variant index schema

* fix: conftest for variant index

* feat(variant index): normalising assessments of in-silico predictors

* feat: adding VEP predictor

* fix: variant test config

* fix: variant test config

* fix: schema type

* fix: dropping failing test

* fix: variant annotation

* fix: gnomad variant index repartition

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
DSuveges and pre-commit-ci[bot] authored Nov 19, 2024
1 parent 4104ce3 commit 9f9cfd6
Show file tree
Hide file tree
Showing 10 changed files with 708 additions and 104 deletions.
24 changes: 24 additions & 0 deletions src/gentropy/assets/schemas/variant_index.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@
"name": "targetId",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "normalisedScore",
"nullable": true,
"type": "double"
}
],
"type": "struct"
Expand Down Expand Up @@ -192,6 +198,18 @@
"nullable": true,
"type": "integer"
},
{
"metadata": {},
"name": "approvedSymbol",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "biotype",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "transcriptId",
Expand Down Expand Up @@ -271,6 +289,12 @@
},
"type": "array"
}
},
{
"metadata": {},
"name": "variantDescription",
"nullable": true,
"type": "string"
}
],
"type": "struct"
Expand Down
12 changes: 12 additions & 0 deletions src/gentropy/assets/schemas/vep_json_output.json
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,18 @@
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "gene_symbol",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "biotype",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "appris",
Expand Down
7 changes: 5 additions & 2 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,7 @@ def get_struct_field_schema(schema: t.StructType, name: str) -> t.DataType:
raise ValueError("Provided name %s is not present in the schema.", name)
return matching_fields[0].dataType


def calculate_harmonic_sum(input_array: Column) -> Column:
"""Calculate the harmonic sum of an array.
Expand Down Expand Up @@ -876,9 +877,11 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
return f.aggregate(
f.arrays_zip(
f.sort_array(input_array, False).alias("score"),
f.sequence(f.lit(1), f.size(input_array)).alias("pos")
f.sequence(f.lit(1), f.size(input_array)).alias("pos"),
),
f.lit(0.0),
lambda acc, x: acc
+ x["score"]/f.pow(x["pos"], 2)/f.lit(sum(1 / ((i + 1)**2) for i in range(1000)))
+ x["score"]
/ f.pow(x["pos"], 2)
/ f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
)
6 changes: 6 additions & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,11 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
"label": "splice_polypyrimidine_tract_variant",
"score": 0.33,
},
{
"id": "SO_0001626",
"label": "incomplete_terminal_codon_variant",
"score": 0.33,
},
{"id": "SO_0001819", "label": "synonymous_variant", "score": 0.33},
{
"id": "SO_0002170",
Expand All @@ -499,6 +504,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
"score": 0.0,
},
{"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
{"id": "SO_0001060", "label": "intergenic_variant", "score": 0.0},
]

_target_: str = "gentropy.variant_index.VariantIndexStep"
Expand Down
266 changes: 266 additions & 0 deletions src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,269 @@ def get_loftee(self: VariantIndex) -> DataFrame:
"isHighQualityPlof",
)
)


class InSilicoPredictorNormaliser:
    """Class to normalise in silico predictor assessments.

    Based on the raw scores, it normalises the scores to a range between -1 and 1,
    and appends the normalised value to the in silico predictor struct.

    Increasingly negative values indicate an increasingly confident benign
    prediction, while increasingly positive values indicate an increasingly
    confident deleterious predicted effect. The point of these operations is to
    make the scores comparable across different in silico predictors.
    """

    @classmethod
    def normalise_in_silico_predictors(
        cls: type[InSilicoPredictorNormaliser],
        in_silico_predictors: Column,
    ) -> Column:
        """Normalise in silico predictors. Appends a normalised score to the in silico predictor struct.

        Args:
            in_silico_predictors (Column): Column containing in silico predictors (list of structs).

        Returns:
            Column: Normalised in silico predictors.
        """
        return f.transform(
            in_silico_predictors,
            lambda predictor: f.struct(
                # Extracting all existing fields of the struct:
                predictor.method.alias("method"),
                predictor.assessment.alias("assessment"),
                predictor.score.alias("score"),
                predictor.assessmentFlag.alias("assessmentFlag"),
                predictor.targetId.alias("targetId"),
                # Appending the normalised score:
                cls.resolve_predictor_methods(
                    predictor.score, predictor.method, predictor.assessment
                ).alias("normalisedScore"),
            ),
        )

    @classmethod
    def resolve_predictor_methods(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
        method: Column,
        assessment: Column,
    ) -> Column:
        """It takes a score, a method, and an assessment, and returns a normalized score for the in silico predictor.

        Args:
            score (Column): The raw score from the in silico predictor.
            method (Column): The method used to generate the score.
            assessment (Column): The assessment of the score.

        Returns:
            Column: Normalised score for the in silico predictor.
                Null for methods not listed below (no `otherwise` clause).
        """
        return (
            f.when(method == "LOFTEE", cls._normalise_loftee(assessment))
            .when(method == "SIFT", cls._normalise_sift(score, assessment))
            .when(method == "PolyPhen", cls._normalise_polyphen(assessment, score))
            .when(method == "AlphaMissense", cls._normalise_alpha_missense(score))
            .when(method == "CADD", cls._normalise_cadd(score))
            .when(method == "Pangolin", cls._normalise_pangolin(score))
            # The following predictors are not normalised:
            .when(method == "SpliceAI", score)
            .when(method == "VEP", score)
        )

    @staticmethod
    def _rescaleColumnValue(
        column: Column,
        min_value: float,
        max_value: float,
        minimum: float = 0.0,
        maximum: float = 1.0,
    ) -> Column:
        """Rescale a column to a new range. Similar to MinMaxScaler in pyspark ML.

        Note: values outside [min_value, max_value] are extrapolated linearly, not
        clamped; callers are expected to gate the input range with `when` clauses.

        Args:
            column (Column): Column to rescale.
            min_value (float): Minimum value of the column.
            max_value (float): Maximum value of the column.
            minimum (float, optional): Minimum value of the new range. Defaults to 0.0.
            maximum (float, optional): Maximum value of the new range. Defaults to 1.0.

        Returns:
            Column: Rescaled column.
        """
        return (column - min_value) / (max_value - min_value) * (
            maximum - minimum
        ) + minimum

    @classmethod
    def _normalise_cadd(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
    ) -> Column:
        """Normalise CADD scores.

        Logic: CADD scores are divided into four ranges and scaled accordingly:
        - 0-10 -> -1-0 (likely benign ~2M)
        - 10-20 -> 0-0.5 (potentially deleterious ~300k)
        - 20-30 -> 0.5-0.75 (likely deleterious ~350k)
        - 30-81 -> 0.75-1 (highly likely deleterious ~86k)

        Args:
            score (Column): CADD score.

        Returns:
            Column: Normalised CADD score.
        """
        return (
            f.when(score <= 10, cls._rescaleColumnValue(score, 0, 10, -1.0, 0.0))
            .when(score <= 20, cls._rescaleColumnValue(score, 10, 20, 0.0, 0.5))
            .when(score <= 30, cls._rescaleColumnValue(score, 20, 30, 0.5, 0.75))
            .when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1.0))
        )

    @classmethod
    def _normalise_loftee(
        cls: type[InSilicoPredictorNormaliser],
        assessment: Column,
    ) -> Column:
        """Normalise LOFTEE scores.

        Logic: LOFTEE assessments are mapped to two fixed values:
        - HC (high confidence): 1.0 (~120k)
        - LC (low confidence): 0.85 (~18k)
        Any other assessment yields null.

        Args:
            assessment (Column): LOFTEE assessment.

        Returns:
            Column: Normalised LOFTEE score.
        """
        # Use a float literal so both branches resolve to DoubleType, matching
        # the `normalisedScore` schema field.
        return f.when(assessment == "HC", f.lit(1.0)).when(
            assessment == "LC", f.lit(0.85)
        )

    @classmethod
    def _normalise_sift(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
        assessment: Column,
    ) -> Column:
        """Normalise SIFT scores.

        The raw SIFT score is inverted (1 - score), so higher values are more
        deleterious, then rescaled according to the assessment:
        - deleterious and 1 - score >= 0.95: 0.5 to 1
        - deleterious_low_confidence and 1 - score >= 0.95: 0 to 0.5
        - tolerated_low_confidence and 1 - score <= 0.95: -0.5 to 0
        - tolerated and 1 - score <= 0.95: -1 to -0.5
        Any other combination yields null.

        Args:
            score (Column): SIFT score.
            assessment (Column): SIFT assessment.

        Returns:
            Column: Normalised SIFT score.
        """
        # NOTE: conditions compare the rounded inverted score, while the rescaled
        # value uses the unrounded one, so results can fall marginally outside
        # the stated sub-ranges at the 0.95 boundary.
        return (
            f.when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious"),
                cls._rescaleColumnValue(1 - score, 0.95, 1, 0.5, 1),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious_low_confidence"),
                cls._rescaleColumnValue(1 - score, 0.95, 1, 0, 0.5),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated_low_confidence"),
                cls._rescaleColumnValue(1 - score, 0, 0.95, -0.5, 0.0),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated"),
                cls._rescaleColumnValue(1 - score, 0, 0.95, -1, -0.5),
            )
        )

    @classmethod
    def _normalise_polyphen(
        cls: type[InSilicoPredictorNormaliser],
        assessment: Column,
        score: Column,
    ) -> Column:
        """Normalise PolyPhen scores.

        Logic: PolyPhen scores are divided into three categories:
        - benign: 0-0.446 -> -1 to -0.25
        - possibly_damaging: 0.446-0.908 -> -0.25 to 0.25
        - probably_damaging: 0.908-1 -> 0.25 to 1
        - if assessment is unknown: None

        Args:
            assessment (Column): PolyPhen assessment.
            score (Column): PolyPhen score.

        Returns:
            Column: Normalised PolyPhen score.
        """
        return (
            f.when(assessment == "unknown", f.lit(None).cast(t.DoubleType()))
            .when(score <= 0.446, cls._rescaleColumnValue(score, 0, 0.446, -1.0, -0.25))
            .when(
                score <= 0.908,
                cls._rescaleColumnValue(score, 0.446, 0.908, -0.25, 0.25),
            )
            .when(score > 0.908, cls._rescaleColumnValue(score, 0.908, 1.0, 0.25, 1.0))
        )

    @classmethod
    def _normalise_alpha_missense(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
    ) -> Column:
        """Normalise AlphaMissense scores.

        Logic: AlphaMissense scores are divided into three categories:
        - 0-0.06 -> -1 to -0.25
        - 0.06-0.77 -> -0.25 to 0.25
        - 0.77-1 -> 0.25 to 1

        Args:
            score (Column): AlphaMissense score.

        Returns:
            Column: Normalised AlphaMissense score.
        """
        return (
            f.when(score < 0.06, cls._rescaleColumnValue(score, 0, 0.06, -1.0, -0.25))
            .when(score < 0.77, cls._rescaleColumnValue(score, 0.06, 0.77, -0.25, 0.25))
            .when(score >= 0.77, cls._rescaleColumnValue(score, 0.77, 1, 0.25, 1))
        )

    @classmethod
    def _normalise_pangolin(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
    ) -> Column:
        """Normalise Pangolin scores.

        Logic: the absolute value of the Pangolin score (direction of the splice
        effect is irrelevant here) is divided into two ranges:
        - |score| 0-0.14 -> 0 to 0.5
        - |score| 0.14-1 -> 0.5 to 1

        Args:
            score (Column): Pangolin score.

        Returns:
            Column: Normalised Pangolin score.
        """
        return f.when(
            f.abs(score) > 0.14, cls._rescaleColumnValue(f.abs(score), 0.14, 1, 0.5, 1)
        ).when(
            f.abs(score) <= 0.14,
            cls._rescaleColumnValue(f.abs(score), 0, 0.14, 0.0, 0.5),
        )
Loading

0 comments on commit 9f9cfd6

Please sign in to comment.