Skip to content

Commit

Permalink
feat(variant index): variant description to summarise variant consequ…
Browse files Browse the repository at this point in the history
…ences in transcripts (#914)

* feat: extending the VEP schema

* feat(vep parser): adding logic to build variant description based on VEP annotation

* fix: remove commented lines

* fix: improving consequence to so term mapping

* fix: nullified variant descriptions

* fix: assessment_flag_column_name type fix

* chore: pre-commit auto fixes [...]

* feat: adding formatting to distances in description

* fix: formatting

* fix: variant index schema

* fix: conftest for variant index

* feat(variant index): normalising assessments of in-silico predictors

* feat: adding VEP predictor

* fix: variant test config

* fix: variant test config

* fix: schema type

* fix: dropping failing test

* fix: variant annotation

* fix: gnomad variant index repartition

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
DSuveges and pre-commit-ci[bot] authored Nov 19, 2024
1 parent 4104ce3 commit 9f9cfd6
Show file tree
Hide file tree
Showing 10 changed files with 708 additions and 104 deletions.
24 changes: 24 additions & 0 deletions src/gentropy/assets/schemas/variant_index.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@
"name": "targetId",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "normalisedScore",
"nullable": true,
"type": "double"
}
],
"type": "struct"
Expand Down Expand Up @@ -192,6 +198,18 @@
"nullable": true,
"type": "integer"
},
{
"metadata": {},
"name": "approvedSymbol",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "biotype",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "transcriptId",
Expand Down Expand Up @@ -271,6 +289,12 @@
},
"type": "array"
}
},
{
"metadata": {},
"name": "variantDescription",
"nullable": true,
"type": "string"
}
],
"type": "struct"
Expand Down
12 changes: 12 additions & 0 deletions src/gentropy/assets/schemas/vep_json_output.json
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,18 @@
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "gene_symbol",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "biotype",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "appris",
Expand Down
7 changes: 5 additions & 2 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,7 @@ def get_struct_field_schema(schema: t.StructType, name: str) -> t.DataType:
raise ValueError("Provided name %s is not present in the schema.", name)
return matching_fields[0].dataType


def calculate_harmonic_sum(input_array: Column) -> Column:
"""Calculate the harmonic sum of an array.
Expand Down Expand Up @@ -876,9 +877,11 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
return f.aggregate(
f.arrays_zip(
f.sort_array(input_array, False).alias("score"),
f.sequence(f.lit(1), f.size(input_array)).alias("pos")
f.sequence(f.lit(1), f.size(input_array)).alias("pos"),
),
f.lit(0.0),
lambda acc, x: acc
+ x["score"]/f.pow(x["pos"], 2)/f.lit(sum(1 / ((i + 1)**2) for i in range(1000)))
+ x["score"]
/ f.pow(x["pos"], 2)
/ f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
)
6 changes: 6 additions & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,11 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
"label": "splice_polypyrimidine_tract_variant",
"score": 0.33,
},
{
"id": "SO_0001626",
"label": "incomplete_terminal_codon_variant",
"score": 0.33,
},
{"id": "SO_0001819", "label": "synonymous_variant", "score": 0.33},
{
"id": "SO_0002170",
Expand All @@ -499,6 +504,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
"score": 0.0,
},
{"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
{"id": "SO_0001060", "label": "intergenic_variant", "score": 0.0},
]

_target_: str = "gentropy.variant_index.VariantIndexStep"
Expand Down
266 changes: 266 additions & 0 deletions src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,269 @@ def get_loftee(self: VariantIndex) -> DataFrame:
"isHighQualityPlof",
)
)


class InSilicoPredictorNormaliser:
    """Class to normalise in silico predictor assessments.

    Based on the raw scores, it normalises the scores to a range between -1 and 1,
    and appends the normalised value to the in silico predictor struct.

    Increasingly negative values indicate an increasingly confident benign
    prediction, while increasingly positive values indicate an increasingly
    confident deleterious predicted effect. The point of these operations is to
    make the scores comparable across different in silico predictors.
    """

    @classmethod
    def normalise_in_silico_predictors(
        cls: type[InSilicoPredictorNormaliser],
        in_silico_predictors: Column,
    ) -> Column:
        """Normalise in silico predictors. Appends a normalised score to the in silico predictor struct.

        Args:
            in_silico_predictors (Column): Column containing in silico predictors (list of structs).

        Returns:
            Column: Normalised in silico predictors.
        """
        return f.transform(
            in_silico_predictors,
            lambda predictor: f.struct(
                # Extracting all existing fields of the struct:
                predictor.method.alias("method"),
                predictor.assessment.alias("assessment"),
                predictor.score.alias("score"),
                predictor.assessmentFlag.alias("assessmentFlag"),
                predictor.targetId.alias("targetId"),
                # Appending the normalised score:
                cls.resolve_predictor_methods(
                    predictor.score, predictor.method, predictor.assessment
                ).alias("normalisedScore"),
            ),
        )

    @classmethod
    def resolve_predictor_methods(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
        method: Column,
        assessment: Column,
    ) -> Column:
        """It takes a score, a method, and an assessment, and returns a normalized score for the in silico predictor.

        Args:
            score (Column): The raw score from the in silico predictor.
            method (Column): The method used to generate the score.
            assessment (Column): The assessment of the score.

        Returns:
            Column: Normalised score for the in silico predictor.
                Null for methods not listed below (no `otherwise` clause).
        """
        return (
            f.when(method == "LOFTEE", cls._normalise_loftee(assessment))
            .when(method == "SIFT", cls._normalise_sift(score, assessment))
            .when(method == "PolyPhen", cls._normalise_polyphen(assessment, score))
            .when(method == "AlphaMissense", cls._normalise_alpha_missense(score))
            .when(method == "CADD", cls._normalise_cadd(score))
            .when(method == "Pangolin", cls._normalise_pangolin(score))
            # The following predictors are not normalised:
            .when(method == "SpliceAI", score)
            .when(method == "VEP", score)
        )

    @staticmethod
    def _rescaleColumnValue(
        column: Column,
        min_value: float,
        max_value: float,
        minimum: float = 0.0,
        maximum: float = 1.0,
    ) -> Column:
        """Rescale a column to a new range. Similar to MinMaxScaler in pyspark ML.

        Note: values outside [min_value, max_value] are extrapolated linearly, not
        clamped; callers are expected to gate the input range with `when` clauses.

        Args:
            column (Column): Column to rescale.
            min_value (float): Minimum value of the column.
            max_value (float): Maximum value of the column.
            minimum (float, optional): Minimum value of the new range. Defaults to 0.0.
            maximum (float, optional): Maximum value of the new range. Defaults to 1.0.

        Returns:
            Column: Rescaled column.
        """
        return (column - min_value) / (max_value - min_value) * (
            maximum - minimum
        ) + minimum

    @classmethod
    def _normalise_cadd(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
    ) -> Column:
        """Normalise CADD scores.

        Logic: CADD scores are divided into four ranges and scaled accordingly:
        - 0-10 -> -1-0 (likely benign ~2M)
        - 10-20 -> 0-0.5 (potentially deleterious ~300k)
        - 20-30 -> 0.5-0.75 (likely deleterious ~350k)
        - 30-81 -> 0.75-1 (highly likely deleterious ~86k)

        Args:
            score (Column): CADD score.

        Returns:
            Column: Normalised CADD score.
        """
        return (
            f.when(score <= 10, cls._rescaleColumnValue(score, 0, 10, -1.0, 0.0))
            .when(score <= 20, cls._rescaleColumnValue(score, 10, 20, 0.0, 0.5))
            .when(score <= 30, cls._rescaleColumnValue(score, 20, 30, 0.5, 0.75))
            .when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1.0))
        )

    @classmethod
    def _normalise_loftee(
        cls: type[InSilicoPredictorNormaliser],
        assessment: Column,
    ) -> Column:
        """Normalise LOFTEE scores.

        Logic: LOFTEE assessments are mapped to two fixed values:
        - HC (high confidence): 1.0 (~120k)
        - LC (low confidence): 0.85 (~18k)
        Any other assessment yields null.

        Args:
            assessment (Column): LOFTEE assessment.

        Returns:
            Column: Normalised LOFTEE score.
        """
        # Use a float literal so both branches resolve to DoubleType, matching
        # the `normalisedScore` schema field.
        return f.when(assessment == "HC", f.lit(1.0)).when(
            assessment == "LC", f.lit(0.85)
        )

    @classmethod
    def _normalise_sift(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
        assessment: Column,
    ) -> Column:
        """Normalise SIFT scores.

        The raw SIFT score is inverted (1 - score), so higher values are more
        deleterious, then rescaled according to the assessment:
        - deleterious and 1 - score >= 0.95: 0.5 to 1
        - deleterious_low_confidence and 1 - score >= 0.95: 0 to 0.5
        - tolerated_low_confidence and 1 - score <= 0.95: -0.5 to 0
        - tolerated and 1 - score <= 0.95: -1 to -0.5
        Any other combination yields null.

        Args:
            score (Column): SIFT score.
            assessment (Column): SIFT assessment.

        Returns:
            Column: Normalised SIFT score.
        """
        # NOTE: conditions compare the rounded inverted score, while the rescaled
        # value uses the unrounded one, so results can fall marginally outside
        # the stated sub-ranges at the 0.95 boundary.
        return (
            f.when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious"),
                cls._rescaleColumnValue(1 - score, 0.95, 1, 0.5, 1),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious_low_confidence"),
                cls._rescaleColumnValue(1 - score, 0.95, 1, 0, 0.5),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated_low_confidence"),
                cls._rescaleColumnValue(1 - score, 0, 0.95, -0.5, 0.0),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated"),
                cls._rescaleColumnValue(1 - score, 0, 0.95, -1, -0.5),
            )
        )

    @classmethod
    def _normalise_polyphen(
        cls: type[InSilicoPredictorNormaliser],
        assessment: Column,
        score: Column,
    ) -> Column:
        """Normalise PolyPhen scores.

        Logic: PolyPhen scores are divided into three categories:
        - benign: 0-0.446 -> -1 to -0.25
        - possibly_damaging: 0.446-0.908 -> -0.25 to 0.25
        - probably_damaging: 0.908-1 -> 0.25 to 1
        - if assessment is unknown: None

        Args:
            assessment (Column): PolyPhen assessment.
            score (Column): PolyPhen score.

        Returns:
            Column: Normalised PolyPhen score.
        """
        return (
            f.when(assessment == "unknown", f.lit(None).cast(t.DoubleType()))
            .when(score <= 0.446, cls._rescaleColumnValue(score, 0, 0.446, -1.0, -0.25))
            .when(
                score <= 0.908,
                cls._rescaleColumnValue(score, 0.446, 0.908, -0.25, 0.25),
            )
            .when(score > 0.908, cls._rescaleColumnValue(score, 0.908, 1.0, 0.25, 1.0))
        )

    @classmethod
    def _normalise_alpha_missense(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
    ) -> Column:
        """Normalise AlphaMissense scores.

        Logic: AlphaMissense scores are divided into three categories:
        - 0-0.06 -> -1 to -0.25
        - 0.06-0.77 -> -0.25 to 0.25
        - 0.77-1 -> 0.25 to 1

        Args:
            score (Column): AlphaMissense score.

        Returns:
            Column: Normalised AlphaMissense score.
        """
        return (
            f.when(score < 0.06, cls._rescaleColumnValue(score, 0, 0.06, -1.0, -0.25))
            .when(score < 0.77, cls._rescaleColumnValue(score, 0.06, 0.77, -0.25, 0.25))
            .when(score >= 0.77, cls._rescaleColumnValue(score, 0.77, 1, 0.25, 1))
        )

    @classmethod
    def _normalise_pangolin(
        cls: type[InSilicoPredictorNormaliser],
        score: Column,
    ) -> Column:
        """Normalise Pangolin scores.

        Logic: the absolute value of the Pangolin score (direction of the splice
        effect is irrelevant here) is divided into two ranges:
        - |score| 0-0.14 -> 0 to 0.5
        - |score| 0.14-1 -> 0.5 to 1

        Args:
            score (Column): Pangolin score.

        Returns:
            Column: Normalised Pangolin score.
        """
        return f.when(
            f.abs(score) > 0.14, cls._rescaleColumnValue(f.abs(score), 0.14, 1, 0.5, 1)
        ).when(
            f.abs(score) <= 0.14,
            cls._rescaleColumnValue(f.abs(score), 0, 0.14, 0.0, 0.5),
        )
Loading

0 comments on commit 9f9cfd6

Please sign in to comment.