Commit

Merge pull request #17 from PGScatalog/dev
v0.1.2 release
smlmbrt authored Sep 15, 2022
2 parents c1b5290 + ce8b4be commit 130b103
Showing 36 changed files with 908 additions and 481 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -1,5 +1,5 @@
name: CI
-on: [push]
+on: [ push ]

jobs:
ci:
4 changes: 2 additions & 2 deletions Dockerfile
@@ -11,8 +11,8 @@ FROM python:3.10

WORKDIR /opt/

-COPY --from=builder /app/dist/pgscatalog_utils-0.1.1-py3-none-any.whl .
+COPY --from=builder /app/dist/pgscatalog_utils-0.1.2-py3-none-any.whl .

-RUN pip install pgscatalog_utils-0.1.1-py3-none-any.whl
+RUN pip install pgscatalog_utils-0.1.2-py3-none-any.whl

RUN apt-get update && apt-get install -y sqlite3
73 changes: 56 additions & 17 deletions conftest.py
@@ -1,12 +1,16 @@
-import pytest
-from unittest.mock import patch
-from pgscatalog_utils.download.download_scorefile import download_scorefile
+import glob
+import os
+from unittest.mock import patch
+
+import pandas as pd
+import polars as pl
+import pytest
import requests as req
-from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
from pysqlar import SQLiteArchive
-import pandas as pd
-import glob

+from pgscatalog_utils.download.download_scorefile import download_scorefile
+from pgscatalog_utils.match.preprocess import complement_valid_alleles
+from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles


@pytest.fixture(scope="session")
@@ -66,7 +70,7 @@ def mini_score_path(tmp_path_factory):
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]
+    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
@@ -78,7 +82,7 @@ def mini_scorefile(mini_score_path, tmp_path_factory):
def combined_scorefile(scorefiles, tmp_path_factory):
# The combined scorefile overlaps poorly with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['-o', str(out_path.resolve())]
+    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
@@ -111,10 +115,11 @@ def chain_files(db, tmp_path_factory):


@pytest.fixture(scope="session")
-def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory):
+def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
    out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38',
-                       '-m', '0.8'] + ['-o', str(out_path.resolve())]
+    args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
+                       'GRCh38',
+                       '-m', '0.8'] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
@@ -123,15 +128,11 @@ def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory):


@pytest.fixture(scope="session")
-def hg38_coords(tmp_path_factory):
-    out_path = tmp_path_factory.mktemp("dummy") / "hg38.txt"
+def hg38_coords():
    d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]}
    df = pd.DataFrame(d)
-    with open(out_path, 'w') as f:
-        f.write('#genome_build=GRCh38\n')
-    df.to_csv(out_path, mode='a', index=False)
-    df['filename'] = str(out_path.resolve())
    df['accession'] = 'dummy'
+    df['genome_build'] = 'GRCh38'
    return df


@@ -142,6 +143,44 @@ def hg19_coords(hg38_coords):
return pd.DataFrame(d)


+@pytest.fixture(scope='session')
+def small_flipped_scorefile(small_scorefile):
+    # simulate a scorefile on the wrong strand
+    return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele'])
+            .drop(['effect_allele', 'other_allele'])
+            .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'})
+            .pipe(complement_valid_alleles, ['effect_allele', 'other_allele']))
+
+
+@pytest.fixture(scope='session')
+def small_target():
+    return pl.DataFrame({"#CHROM": [1, 2, 3],
+                         "POS": [1, 2, 3],
+                         "REF": ["A", "T", "T"],
+                         "ALT": ["C", "A", "G"],
+                         "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
+                         "is_multiallelic": [False, False, False]})
+
+
+@pytest.fixture(scope='session')
+def small_scorefile():
+    df = pl.DataFrame({"accession": ["test", "test", "test"],
+                       "row_nr": [1, 2, 3],
+                       "chr_name": [1, 2, 3],
+                       "chr_position": [1, 2, 3],
+                       "effect_allele": ["A", "A", "G"],
+                       "other_allele": ["C", "T", "T"],
+                       "effect_weight": [1, 2, 3],
+                       "effect_type": ["additive", "additive", "additive"]})
+
+    return complement_valid_alleles(df, ["effect_allele", "other_allele"])
+
+
+@pytest.fixture(scope='session')
+def small_scorefile_no_oa(small_scorefile):
+    return small_scorefile.with_column(pl.lit(None).alias('other_allele'))
+
+
def _get_timeout(url):
try:
return req.get(url, timeout=5)
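
For illustration, a minimal sketch of tests consuming the new session fixtures. The *_FLIP column names are taken from the small_flipped_scorefile fixture above; the behaviour asserted for complement_valid_alleles is an assumption, not part of this diff:

def test_small_scorefile_has_flip_columns(small_scorefile):
    # complement_valid_alleles appears to add complemented *_FLIP columns
    # (small_flipped_scorefile renames them back over the originals)
    assert 'effect_allele_FLIP' in small_scorefile.columns
    assert 'other_allele_FLIP' in small_scorefile.columns

def test_no_oa_blanks_other_allele(small_scorefile_no_oa):
    # pl.lit(None) overwrites every other_allele value with null
    assert small_scorefile_no_oa['other_allele'].null_count() == small_scorefile_no_oa.height
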
2 changes: 1 addition & 1 deletion pgscatalog_utils/__init__.py
@@ -1 +1 @@
-__version__ = '0.1.1'
+__version__ = '0.1.2'
2 changes: 1 addition & 1 deletion pgscatalog_utils/download/download_scorefile.py
@@ -116,7 +116,7 @@ def _parse_args(args=None) -> argparse.Namespace:
parser.add_argument('-t', '--efo', dest='efo', nargs='+',
help='Traits described by an EFO term(s) (e.g. EFO_0004611)')
parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+')
-    parser.add_argument('-b', '--build', dest='build',
+    parser.add_argument('-b', '--build', dest='build', choices=['GRCh37', 'GRCh38'],
help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')
parser.add_argument('-o', '--outdir', dest='outdir', required=True,
default='scores/',
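
A sketch of driving the new --build validation, reusing the patch('sys.argv', ...) pattern from conftest.py; the -i flag for score IDs is an assumption, since this hunk only shows the -t/-p/-b/-o arguments (note this would download real files from the PGS Catalog):

from unittest.mock import patch
from pgscatalog_utils.download.download_scorefile import download_scorefile

args = ['download_scorefiles', '-i', 'PGS000822', '-b', 'GRCh38', '-o', 'scores/']
with patch('sys.argv', args):
    download_scorefile()  # an invalid -b value now fails fast inside argparse
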
6 changes: 2 additions & 4 deletions pgscatalog_utils/download/publication.py
@@ -1,7 +1,8 @@
-import requests
import logging
from functools import reduce
+
+import requests

logger = logging.getLogger(__name__)


@@ -17,6 +18,3 @@ def query_publication(pgp: str) -> list[str]:
pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids')
logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}")
return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values()))
-
-
-
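
The reduce() above unions associated score IDs across every value in the response dict; a toy illustration with made-up keys and IDs:

from functools import reduce

pgs = {'development': ['PGS000001', 'PGS000002'], 'evaluation': ['PGS000002', 'PGS000003']}
ids = list(reduce(lambda x, y: set(x).union(set(y)), pgs.values()))
# each ID appears once: {'PGS000001', 'PGS000002', 'PGS000003'} returned as a list
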
10 changes: 6 additions & 4 deletions pgscatalog_utils/download/score.py
@@ -1,8 +1,9 @@
-import requests
import logging
-import jq
import sys
+
+import jq
+import requests

logger = logging.getLogger(__name__)


@@ -36,7 +37,7 @@ def query_score(pgs_id: list[str]) -> dict:

def _chunker(pgs: list[str]):
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
-    return(pgs[pos: pos + size] for pos in range(0, len(pgs), size))
+    return (pgs[pos: pos + size] for pos in range(0, len(pgs), size))


def _parse_json_query(json: dict, build: str | None) -> dict[str, str]:
@@ -53,5 +54,6 @@ def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]:
        result: list[str] = jq.compile(f'[.results][][].ftp_scoring_file').input(
            json).all()
    else:
-        result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all()
+        result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(
+            json).all()
return dict(zip(id, [x.replace('https', 'ftp') for x in result]))
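
_chunker is a generator that pages IDs in batches of 50 to respect the REST endpoint limit; a quick worked example with dummy accessions:

from pgscatalog_utils.download.score import _chunker

ids = [f'PGS{i:06d}' for i in range(1, 121)]  # 120 dummy accessions
chunks = list(_chunker(ids))
assert [len(chunk) for chunk in chunks] == [50, 50, 20]
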
3 changes: 2 additions & 1 deletion pgscatalog_utils/download/trait.py
@@ -1,7 +1,8 @@
-import requests
import logging
from functools import reduce
+
+import requests

logger = logging.getLogger(__name__)


2 changes: 1 addition & 1 deletion pgscatalog_utils/log_config.py
@@ -12,4 +12,4 @@ def set_logging_level(verbose: bool):
    else:
        logging.basicConfig(level=logging.WARNING,
                            format=log_fmt,
-                            datefmt='%Y-%m-%d %H:%M:%S')
\ No newline at end of file
+                            datefmt='%Y-%m-%d %H:%M:%S')
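
For reference, how set_logging_level is typically called; the verbose branch isn't shown in this hunk, so the level it selects is an assumption:

from pgscatalog_utils.log_config import set_logging_level

set_logging_level(verbose=True)   # assumed to lower the threshold below WARNING
set_logging_level(verbose=False)  # WARNING, per the else branch above
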
55 changes: 55 additions & 0 deletions pgscatalog_utils/match/filter.py
@@ -0,0 +1,55 @@
+import logging
+
+import polars as pl
+
+logger = logging.getLogger(__name__)
+
+
+def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float,
+                  dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]:
+    """ Check overlap between filtered matches and scorefile, remove scores that don't match well and report stats """
+    filtered_matches: pl.DataFrame = _filter_matches(matches)
+    match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset)
+                               .with_columns(pl.col('best_match').fill_null(False)))
+
+    fail_rates: pl.DataFrame = _calculate_match_rate(match_log)
+
+    scores: list[pl.DataFrame] = []
+    for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()):
+        if rate < (1 - min_overlap):
+            df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [True], 'match_rate': [1 - rate]})
+            logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)")
+            scores.append(df.with_column(pl.col('accession').cast(pl.Categorical)))
+        else:
+            df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [False], 'match_rate': [1 - rate]})
+            logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)")
+            scores.append(df.with_column(pl.col('accession').cast(pl.Categorical)))
+
+    score_summary: pl.DataFrame = pl.concat(scores)
+    filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left')
+                                     .filter(pl.col('score_pass') == True))
+
+    return filtered_scores, score_summary
+
+
+def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame:
+    logger.debug("Calculating overlap between target genome and scoring file")
+    return (df.groupby('accession')
+            .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')])
+            .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')))
+
+
+def _filter_matches(df: pl.DataFrame) -> pl.DataFrame:
+    logger.debug("Filtering variants with exclude flag")
+    return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False))
+
+
+def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame:
+    return (scorefile.join(matches, on=['row_nr', 'accession'], how='left')
+            .with_column(pl.lit(dataset).alias('dataset'))
+            .select(pl.exclude("^.*_right$")))
+
+
+def _match_keys() -> list[str]:
+    return ['chr_name', 'chr_position', 'effect_allele', 'other_allele',
+            'accession', 'effect_type', 'effect_weight']
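
A worked example of the fail-rate arithmetic in _calculate_match_rate, mirroring the polars expressions above (the match_type values are invented, and groupby/with_column assume the polars version this file targets):

import polars as pl

matches = pl.DataFrame({'accession': ['PGS000001'] * 4,
                        'match_type': ['refalt', 'refalt', 'flip', None]})
rates = (matches.groupby('accession')
         .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')])
         .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')))
# fail_rate = 1/4 = 0.25, so 75% of variants matched:
# min_overlap = 0.7 passes (0.25 < 0.30); min_overlap = 0.8 fails (0.25 >= 0.20)
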
(Diff truncated: the remaining changed files are not shown on this page.)