-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(l2g): implement variant consequence features from VEP #805
Changes from all commits
8fa0d72
5742436
4e8903a
8b233e4
46be5de
70e9adc
6919997
418c47b
cdbc361
62c13f2
4ccd5d0
f892d23
320f430
24e476a
604e8ba
7344ef0
20e74d8
48e6a30
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
--- | ||
title: From VEP | ||
--- | ||
|
||
## List of features | ||
|
||
::: gentropy.dataset.l2g_features.vep.VepMeanFeature | ||
::: gentropy.dataset.l2g_features.vep.VepMeanNeighbourhoodFeature | ||
::: gentropy.dataset.l2g_features.vep.VepMaximumFeature | ||
::: gentropy.dataset.l2g_features.vep.VepMaximumNeighbourhoodFeature | ||
|
||
## Common logic | ||
|
||
::: gentropy.dataset.l2g_features.vep.common_vep_feature_logic | ||
::: gentropy.dataset.l2g_features.vep.common_neighbourhood_vep_feature_logic |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
"""Collection of methods that extract distance features from the variant index dataset.""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING, Any | ||
|
||
import pyspark.sql.functions as f | ||
from pyspark.sql import Window | ||
|
||
from gentropy.common.spark_helpers import convert_from_wide_to_long | ||
from gentropy.dataset.gene_index import GeneIndex | ||
from gentropy.dataset.l2g_features.l2g_feature import L2GFeature | ||
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard | ||
from gentropy.dataset.study_locus import StudyLocus | ||
from gentropy.dataset.variant_index import VariantIndex | ||
|
||
if TYPE_CHECKING: | ||
from pyspark.sql import DataFrame | ||
|
||
|
||
def common_vep_feature_logic(
    study_loci_to_annotate: L2GGoldStandard | StudyLocus,
    *,
    variant_index: VariantIndex,
    feature_name: str,
) -> DataFrame:
    """Extracts variant severity score computed from VEP.

    Every variant of the input is joined to its transcript consequences and the
    severity scores are aggregated per studyLocusId/geneId pair. The aggregation
    is selected from the feature name:

    - names containing "Maximum" take the maximum severity score;
    - names containing "Mean" take the mean of the severity score multiplied by
      the posterior probability of the variant.

    For a StudyLocus the variants and their posterior probabilities are read from
    the `locus` column. For an L2GGoldStandard the `variantId` column is used and
    the posterior probability is set to 1.0.

    Args:
        study_loci_to_annotate (L2GGoldStandard | StudyLocus): The dataset containing study loci that will be used for annotation
        variant_index (VariantIndex): The dataset containing functional consequence information
        feature_name (str): The name of the feature

    Returns:
        DataFrame: Feature dataset

    Raises:
        ValueError: If `feature_name` contains neither "Maximum" nor "Mean"
        TypeError: If `study_loci_to_annotate` is neither a StudyLocus nor an L2GGoldStandard
    """
    # Validate up front: previously an unsupported name or input type only
    # surfaced later as an UnboundLocalError on `agg_expr` / `variants_df`.
    use_maximum = "Maximum" in feature_name
    if not use_maximum and "Mean" not in feature_name:
        raise ValueError(
            f"Unsupported VEP feature name '{feature_name}': "
            "it must contain either 'Maximum' or 'Mean'."
        )
    if not isinstance(study_loci_to_annotate, (StudyLocus, L2GGoldStandard)):
        raise TypeError(
            "study_loci_to_annotate must be a StudyLocus or an L2GGoldStandard, "
            f"got {type(study_loci_to_annotate).__name__}."
        )

    # Variant/Target/Severity dataframe: one row per transcript consequence
    consequences_dataset = variant_index.df.withColumn(
        "transcriptConsequence", f.explode("transcriptConsequences")
    ).select(
        "variantId",
        f.col("transcriptConsequence.targetId").alias("geneId"),
        f.col("transcriptConsequence.consequenceScore").alias("severityScore"),
    )

    if isinstance(study_loci_to_annotate, StudyLocus):
        # One row per variant in the credible set. The join below uses the
        # default (inner) join type, so rows without a matching variantId in
        # the consequences dataset are not kept.
        variants_df = (
            study_loci_to_annotate.df.withColumn(
                "variantInLocus", f.explode_outer("locus")
            )
            .select(
                "studyLocusId",
                f.col("variantInLocus.variantId").alias("variantId"),
                f.col("variantInLocus.posteriorProbability").alias(
                    "posteriorProbability"
                ),
            )
            .join(consequences_dataset, "variantId")
        )
    else:
        # Gold standard: a single variant per row, given full weight
        variants_df = study_loci_to_annotate.df.select(
            "studyLocusId", "variantId", f.lit(1.0).alias("posteriorProbability")
        ).join(consequences_dataset, "variantId")

    if use_maximum:
        agg_expr = f.max("severityScore")
    else:
        # Weight each severity score by the posterior probability of its variant
        variants_df = variants_df.withColumn(
            "weightedScore", f.col("severityScore") * f.col("posteriorProbability")
        )
        agg_expr = f.mean("weightedScore")

    return variants_df.groupBy("studyLocusId", "geneId").agg(
        agg_expr.alias(feature_name)
    )
|
||
|
||
def common_neighbourhood_vep_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    variant_index: VariantIndex,
    gene_index: GeneIndex,
    feature_name: str,
) -> DataFrame:
    """Extracts variant severity score computed from VEP for any gene, based on what is the mean score for protein coding genes that are nearby the locus.

    The per-gene score is computed with `common_vep_feature_logic`, using the
    feature name with "Neighbourhood" removed. The returned feature is that score
    minus the mean score of the protein coding genes sharing the same studyLocusId
    (0 when the study locus has no protein coding gene).

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        variant_index (VariantIndex): The dataset containing functional consequence information
        gene_index (GeneIndex): The dataset containing the gene biotype
        feature_name (str): The name of the feature

    Returns:
        DataFrame: Feature dataset

    Raises:
        ValueError: If `feature_name` does not contain "Neighbourhood"
    """
    local_feature_name = feature_name.replace("Neighbourhood", "")
    if local_feature_name == feature_name:
        # With identical names the final `.drop(...)` would silently remove the
        # computed feature column, so fail loudly instead.
        raise ValueError(
            f"Neighbourhood feature name '{feature_name}' must contain 'Neighbourhood'."
        )

    # First compute the VEP score of each studyLocus/gene pair
    local_metric = common_vep_feature_logic(
        study_loci_to_annotate,
        feature_name=local_feature_name,
        variant_index=variant_index,
    )
    return (
        # Then compute the mean score in the vicinity (the regional metric is the
        # same for every gene associated with a studyLocus)
        local_metric.join(
            # Bring gene classification
            gene_index.df.select("geneId", "biotype"),
            "geneId",
            "inner",
        )
        .withColumn(
            "regional_metric",
            f.coalesce(
                # Calculate mean based on protein coding genes.
                # The genes of a studyLocusId are those produced by
                # `common_vep_feature_logic`, i.e. the union of the genes
                # annotated for all of its variants.
                # NOTE(review): per the PR discussion, these are the genes of
                # the transcripts a variant overlaps within a 500kb window.
                # Whether a gene missing for a given variant should count as a
                # VEP score of 0 was asked in review and is still open.
                f.mean(
                    f.when(
                        f.col("biotype") == "protein_coding", f.col(local_feature_name)
                    )
                ).over(Window.partitionBy("studyLocusId")),
                # Default to 0 if there are no protein coding genes
                f.lit(0),
            ),
        )
        .withColumn(feature_name, f.col(local_feature_name) - f.col("regional_metric"))
        .drop("regional_metric", local_feature_name, "biotype")
    )
|
||
|
||
class VepMaximumFeature(L2GFeature):
    """Highest functional consequence score across the variants of a credible set, per studyLocus/gene."""

    feature_dependency_type = VariantIndex
    feature_name = "vepMaximum"

    @classmethod
    def compute(
        cls: type[VepMaximumFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMaximumFeature:
        """Builds the feature dataset in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic` (the dataset with the functional consequence information)

        Returns:
            VepMaximumFeature: Feature dataset
        """
        # Wide table: one column named after the feature per studyLocus/gene
        wide_features = common_vep_feature_logic(
            study_loci_to_annotate=study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Reshape into featureName/featureValue rows
        long_features = convert_from_wide_to_long(
            wide_features,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_features, _schema=cls.get_schema())
|
||
|
||
class VepMaximumNeighbourhoodFeature(L2GFeature):
    """Highest functional consequence score across the variants of a credible set, per studyLocus/gene, expressed relative to the mean VEP score of the protein coding genes in the vicinity."""

    feature_dependency_type = [VariantIndex, GeneIndex]
    feature_name = "vepMaximumNeighbourhood"

    @classmethod
    def compute(
        cls: type[VepMaximumNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMaximumNeighbourhoodFeature:
        """Builds the feature dataset in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic` (the datasets with the functional consequence and gene information)

        Returns:
            VepMaximumNeighbourhoodFeature: Feature dataset
        """
        # Wide table: one column named after the feature per studyLocus/gene
        wide_features = common_neighbourhood_vep_feature_logic(
            study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Reshape into featureName/featureValue rows
        long_features = convert_from_wide_to_long(
            wide_features,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_features, _schema=cls.get_schema())
|
||
|
||
class VepMeanFeature(L2GFeature):
    """Mean functional consequence score across the variants of a credible set, per studyLocus/gene.

    Each severity score is weighted by the posterior probability of its variant before averaging.
    """

    feature_dependency_type = VariantIndex
    feature_name = "vepMean"

    @classmethod
    def compute(
        cls: type[VepMeanFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMeanFeature:
        """Builds the feature dataset in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic` (the dataset with the functional consequence information)

        Returns:
            VepMeanFeature: Feature dataset
        """
        # Wide table: one column named after the feature per studyLocus/gene
        wide_features = common_vep_feature_logic(
            study_loci_to_annotate=study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Reshape into featureName/featureValue rows
        long_features = convert_from_wide_to_long(
            wide_features,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_features, _schema=cls.get_schema())
|
||
|
||
class VepMeanNeighbourhoodFeature(L2GFeature):
    """Mean functional consequence score across the variants of a credible set, per studyLocus/gene, expressed relative to the mean VEP score of the protein coding genes in the vicinity.

    Each severity score is weighted by the posterior probability of its variant before averaging.
    """

    feature_dependency_type = [VariantIndex, GeneIndex]
    feature_name = "vepMeanNeighbourhood"

    @classmethod
    def compute(
        cls: type[VepMeanNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMeanNeighbourhoodFeature:
        """Builds the feature dataset in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic` (the datasets with the functional consequence and gene information)

        Returns:
            VepMeanNeighbourhoodFeature: Feature dataset
        """
        # Wide table: one column named after the feature per studyLocus/gene
        wide_features = common_neighbourhood_vep_feature_logic(
            study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Reshape into featureName/featureValue rows
        long_features = convert_from_wide_to_long(
            wide_features,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_features, _schema=cls.get_schema())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we still have L2GGoldStandard? Isn't it going to be removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As agreed offline, we'll do that once we tackle opentargets/issues#3525