-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #17 from PGScatalog/dev
v0.1.2 release
- Loading branch information
Showing
36 changed files
with
908 additions
and
481 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
name: CI | ||
on: [push] | ||
on: [ push ] | ||
|
||
jobs: | ||
ci: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = '0.1.1' | ||
__version__ = '0.1.2' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import logging | ||
|
||
import polars as pl | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float,
                  dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Remove scores that match the target dataset poorly and report per-score match rates.

    The matches are first reduced to best, non-excluded rows, then left-joined onto
    the scoring file so that unmatched scoring-file rows are counted as failures.

    Args:
        scorefile: Scoring file variants; joined to the matches on ``row_nr`` and ``accession``.
        matches: Candidate matches; must provide ``best_match`` and ``exclude`` columns.
        min_overlap: Minimum proportion of matched variants. A score passes only when
            its fail rate is strictly below ``1 - min_overlap``.
        dataset: Label written to the ``dataset`` column of the intermediate match log.

    Returns:
        A ``(filtered_scores, score_summary)`` tuple. ``filtered_scores`` holds the
        filtered matches belonging to passing scores; ``score_summary`` has one row
        per accession with ``accession``, ``score_pass`` and ``match_rate`` columns.
    """
    filtered_matches: pl.DataFrame = _filter_matches(matches)
    match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset)
                               .with_columns(pl.col('best_match').fill_null(False)))

    fail_rates: pl.DataFrame = _calculate_match_rate(match_log)
    max_fail_rate: float = 1 - min_overlap

    # NOTE(review): if fail_rates has no rows, pl.concat below receives an empty
    # list -- confirm callers always supply at least one score.
    scores: list[pl.DataFrame] = []
    for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()):
        score_pass: bool = rate < max_fail_rate
        if score_pass:
            logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)")
        else:
            logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)")

        df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [score_pass],
                                         'match_rate': [1 - rate]})
        # Cast accession to Categorical before the summary is joined back on it.
        scores.append(df.with_columns(pl.col('accession').cast(pl.Categorical)))

    score_summary: pl.DataFrame = pl.concat(scores)
    # `== True` is kept on purpose: it builds a polars expression, not a Python bool.
    filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left')
                                     .filter(pl.col('score_pass') == True))  # noqa: E712

    return filtered_scores, score_summary
|
||
|
||
def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame:
    """Compute, per accession, the fraction of rows that have no match.

    Args:
        df: Match log with ``accession`` and ``match_type`` columns; a null
            ``match_type`` marks a row without a match.

    Returns:
        One row per accession with ``count`` (total rows), ``no_match`` (rows
        whose ``match_type`` is null) and ``fail_rate`` (``no_match / count``).
    """
    logger.debug("Calculating overlap between target genome and scoring file")
    # is_null() is the explicit null test; `== None` can propagate null instead
    # of yielding True, which would make every fail rate zero.
    return (df.groupby('accession')
            .agg([pl.count(), pl.col('match_type').is_null().sum().alias('no_match')])
            .with_columns((pl.col('no_match') / pl.col('count')).alias('fail_rate')))
|
||
|
||
def _filter_matches(df: pl.DataFrame) -> pl.DataFrame:
    """Keep only rows where ``best_match`` is true and ``exclude`` is false."""
    logger.debug("Filtering variants with exclude flag")
    # The explicit comparisons build polars expressions, so E712 does not apply.
    is_best_match = pl.col('best_match') == True  # noqa: E712
    is_not_excluded = pl.col('exclude') == False  # noqa: E712
    return df.filter(is_best_match & is_not_excluded)
|
||
|
||
def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame:
    """Left-join the matches onto the scoring file and label the result.

    Args:
        matches: Match rows, joined on ``row_nr`` and ``accession``.
        scorefile: Scoring file rows; every row is kept, matched or not.
        dataset: Value written to every row of the new ``dataset`` column.

    Returns:
        The joined frame with a ``dataset`` column added and every column whose
        name ends in ``_right`` (duplicates produced by the join) removed.
    """
    return (scorefile.join(matches, on=['row_nr', 'accession'], how='left')
            .with_columns(pl.lit(dataset).alias('dataset'))
            .select(pl.exclude("^.*_right$")))
|
||
|
||
def _match_keys() -> list[str]: | ||
return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', | ||
'accession', 'effect_type', 'effect_weight'] |
Oops, something went wrong.