Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adding finemapping method to studylocusid hash #744

Merged
merged 11 commits into from
Sep 9, 2024
28 changes: 23 additions & 5 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,14 +323,14 @@ def _align_overlapping_tags(
def assign_study_locus_id(
    study_id_col: Column,
    variant_id_col: Column,
    finemapping_col: Column | None = None,
) -> Column:
    """Hashes a column with a variant ID and a study ID to extract a consistent studyLocusId.

    Args:
        study_id_col (Column): column name with a study ID
        variant_id_col (Column): column name with a variant ID
        finemapping_col (Column | None): optional column with fine mapping methodology;
            when provided it is folded into the hash so the same study/variant pair
            fine-mapped by different methods gets distinct studyLocusIds

    Returns:
        Column: column with a study locus ID

    Examples:
        >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
        >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show()
        +----------+----------+-----------------+-------------------+
        |   studyId| variantId|finemappingMethod|     study_locus_id|
        +----------+----------+-----------------+-------------------+
        |GCST000001|1_1000_A_C|        SuSiE-inf|3801266831619496075|
        |GCST000002|1_1000_A_C|             pics|1581844826999194430|
        +----------+----------+-----------------+-------------------+
        <BLANKLINE>
        >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C"), ("GCST000002", "1_1000_A_C")]).toDF("studyId", "variantId")
        >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"))).show()
        +----------+----------+-------------------+
        |   studyId| variantId|     study_locus_id|
        +----------+----------+-------------------+
        |GCST000001|1_1000_A_C|1553357789130151995|
        |GCST000002|1_1000_A_C|-415050894682709184|
        +----------+----------+-------------------+
        <BLANKLINE>
    """
    # Null variant IDs would hash identically; substitute a random string so each
    # such row still receives a unique studyLocusId.
    variant_id_col = f.coalesce(variant_id_col, f.rand().cast("string"))
    # Build the hash input once instead of duplicating the xxhash64 call per branch.
    hash_columns = [study_id_col, variant_id_col]
    if finemapping_col is not None:
        hash_columns.append(finemapping_col)
    return f.xxhash64(*hash_columns).alias("studyLocusId")

@classmethod
def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column:
Expand Down
4 changes: 1 addition & 3 deletions src/gentropy/datasource/gwas_catalog/associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,9 +1096,7 @@ def update_study_id(
.drop("subStudyDescription", "updatedStudyId")
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
),
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
)
return self

Expand Down
4 changes: 1 addition & 3 deletions src/gentropy/datasource/open_targets/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ def parse_positive_curation(
)
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
),
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
)
.groupBy("studyLocusId", "studyId", "variantId", "geneId")
.agg(f.collect_set("source").alias("sources"))
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/locus_breaker_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def locus_breaker(
.cast(t.ArrayType(t.StringType()))
.alias("qualityControls"),
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
f.col("studyId"), f.col("variantId")
).alias("studyLocusId"),
)
),
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/window_based_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def clump(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
f.col("studyId"), f.col("variantId")
),
)
# Initialize QC column as array of strings:
Expand Down
4 changes: 1 addition & 3 deletions tests/gentropy/dataset/test_study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,9 +240,7 @@ def test_assign_study_locus_id__null_variant_id(spark: SparkSession) -> None:
schema="studyId: string, variantId: string",
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
),
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
)
assert (
df.select("studyLocusId").distinct().count() == 2
Expand Down