Merge pull request #9 from PGScatalog/dev
Merge dev for release v0.1.1
smlmbrt authored Aug 23, 2022
2 parents a6de70c + 50999ac commit c1b5290
Showing 24 changed files with 809 additions and 331 deletions.
46 changes: 40 additions & 6 deletions README.md
@@ -20,16 +20,50 @@ of `combine_scorefile` to produce scoring files for plink 2
$ pip install pgscatalog-utils
```

Or clone the repo:

```
$ git clone https://github.com/PGScatalog/pgscatalog_utils.git
```

## Quickstart

```
$ download_scorefiles -i PGS000922 PGS001229 -o . -b GRCh37
$ combine_scorefiles -s PGS*.txt.gz -o combined.txt
$ match_variants -s combined.txt -t <example.pvar> --min_overlap 0.75 --outdir .
```

More details are available using the `--help` parameter.

## Install from source

Requirements:

- python 3.10
- [poetry](https://python-poetry.org)

```
$ git clone https://github.com/PGScatalog/pgscatalog_utils.git
$ cd pgscatalog_utils
$ poetry install
$ poetry build
$ pip install --user dist/*.whl
```

## Credits

The `pgscatalog_utils` package is developed as part of the **Polygenic Score (PGS) Catalog**
([www.PGSCatalog.org](https://www.PGSCatalog.org)) project, a collaboration between the
University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert, Laurent Gil)
and the European Bioinformatics Institute (Helen Parkinson, Aoife McMahon, Ben Wingfield, Laura Harris).

A manuscript describing the tool and the larger PGS Catalog Calculator pipeline
[(`PGSCatalog/pgsc_calc`)](https://github.com/PGScatalog/pgsc_calc) is in preparation. In the meantime,
if you use these tools, we ask you to cite the repo(s) and the paper describing the PGS Catalog resource:

- >PGS Catalog utilities _(in development)_. PGS Catalog
Team. [https://github.com/PGScatalog/pgscatalog_utils](https://github.com/PGScatalog/pgscatalog_utils)
- >PGS Catalog Calculator _(in development)_. PGS Catalog
Team. [https://github.com/PGScatalog/pgsc_calc](https://github.com/PGScatalog/pgsc_calc)
- >Lambert _et al._ (2021) The Polygenic Score Catalog as an open database for
reproducibility and systematic evaluation. Nature Genetics. 53:420–425
doi:[10.1038/s41588-021-00783-5](https://doi.org/10.1038/s41588-021-00783-5).

This work has received funding from EMBL-EBI core funds, the Baker Institute, the University of Cambridge,
Health Data Research UK (HDRUK), and the European Union's Horizon 2020 research and innovation programme
under grant agreement No 101016775 INTERVENE.
9 changes: 3 additions & 6 deletions conftest.py
@@ -6,6 +6,7 @@
from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
from pysqlar import SQLiteArchive
import pandas as pd
import glob


@pytest.fixture(scope="session")
@@ -21,11 +21,7 @@ def scorefiles(tmp_path_factory, pgs_accessions):
with patch('sys.argv', args):
download_scorefile()

paths: list[str] = [os.path.join(fn.resolve(), x + '.txt.gz') for x in pgs_accessions]

assert all([os.path.exists(x) for x in paths])

return paths
return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))


@pytest.fixture(scope="session")
@@ -117,7 +114,7 @@ def chain_files(db, tmp_path_factory)
def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory):
out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38',
'-m', '0.95'] + ['-o', str(out_path.resolve())]
'-m', '0.8'] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
2 changes: 1 addition & 1 deletion pgscatalog_utils/__init__.py
@@ -1 +1 @@
__version__ = '0.1.0'
__version__ = '0.1.1'
43 changes: 0 additions & 43 deletions pgscatalog_utils/download/api.py

This file was deleted.

112 changes: 90 additions & 22 deletions pgscatalog_utils/download/download_scorefile.py
@@ -1,44 +1,58 @@
import logging
import argparse
import logging
import os
import shutil
import textwrap
from contextlib import closing
from functools import reduce
from urllib import request as request
from pgscatalog_utils.download.api import pgscatalog_result

from pgscatalog_utils.download.publication import query_publication
from pgscatalog_utils.download.score import get_url
from pgscatalog_utils.download.trait import query_trait
from pgscatalog_utils.log_config import set_logging_level

logger = logging.getLogger(__name__)


def parse_args(args=None) -> argparse.Namespace:
parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files')
parser.add_argument('-i', '--id', nargs='+', dest='pgs',
help='<Required> PGS Catalog ID', required=True)
parser.add_argument('-b', '--build', dest='build', required=True,
help='<Required> Genome build: GRCh37 or GRCh38')
parser.add_argument('-o', '--outdir', dest='outdir', required=True,
default='scores/',
help='<Required> Output directory to store downloaded files')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='<Optional> Extra logging information')
return parser.parse_args(args)
def download_scorefile() -> None:
args = _parse_args()
set_logging_level(args.verbose)
_check_args(args)
_mkdir(args.outdir)

if args.build is None:
logger.critical('Downloading scoring file(s) in the author-reported genome build')
elif args.build in ['GRCh37', 'GRCh38']:
logger.critical(f'Downloading harmonized scoring file(s) in build: {args.build}.')
else:
logger.critical(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported')
raise Exception

def download_scorefile() -> None:
args = parse_args()
pgs_lst: list[list[str]] = []

set_logging_level(args.verbose)
if args.efo:
logger.debug("--trait set, querying traits")
pgs_lst = pgs_lst + [query_trait(x) for x in args.efo]

_mkdir(args.outdir)
if args.pgp:
logger.debug("--pgp set, querying publications")
pgs_lst = pgs_lst + [query_publication(x) for x in args.pgp]

if args.build not in ['GRCh37', 'GRCh38']:
raise Exception(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported')
if args.pgs:
logger.debug("--id set, querying scores")
pgs_lst.append(args.pgs) # pgs_lst: a list containing up to three flat lists

urls: dict[str, str] = pgscatalog_result(args.pgs, args.build)
pgs_id: list[str] = list(set(reduce(lambda x, y: x + y, pgs_lst)))

urls: dict[str, str] = get_url(pgs_id, args.build)

for pgsid, url in urls.items():
logger.debug(f"Downloading {pgsid} from {url}")
path: str = os.path.join(args.outdir, pgsid + '.txt.gz')
if args.build is None:
path: str = os.path.join(args.outdir, pgsid + '.txt.gz')
else:
path: str = os.path.join(args.outdir, pgsid + f'_hmPOS_{args.build}.txt.gz')
_download_ftp(url, path)


@@ -58,5 +72,59 @@ def _download_ftp(url: str, path: str) -> None:
shutil.copyfileobj(r, f)


def _check_args(args):
if not args.efo:
if not args.pgp:
if not args.pgs:
logger.critical("One of --trait, --pgp, or --id is required to download scorefiles")
raise Exception


def _description_text() -> str:
return textwrap.dedent('''\
Download a set of scoring files from the PGS Catalog using PGS
Scoring IDs, traits, or publication IDs.
The PGS Catalog API is queried to get a list of scoring file
URLs. Scoring files are downloaded via FTP to a specified
directory. PGS Catalog scoring files are staged with the name:
{PGS_ID}.txt.gz
If a valid build is specified, harmonized files are downloaded as:
{PGS_ID}_hmPOS_{genome_build}.txt.gz
These harmonized scoring files contain genomic coordinates,
remapped from author-submitted information such as rsIDs.
''')


def _epilog_text() -> str:
return textwrap.dedent('''\
download_scorefiles will skip downloading a scoring file if it
already exists in the download directory. This can be useful if
the download process is interrupted and needs to be restarted
later. You can track download progress with the verbose flag.
''')


def _parse_args(args=None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)')
parser.add_argument('-t', '--efo', dest='efo', nargs='+',
help='Traits described by EFO term(s) (e.g. EFO_0004611)')
parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+')
parser.add_argument('-b', '--build', dest='build',
help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')
parser.add_argument('-o', '--outdir', dest='outdir', required=True,
default='scores/',
help='<Required> Output directory to store downloaded files')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='<Optional> Extra logging information')
return parser.parse_args(args)


if __name__ == "__main__":
download_scorefile()
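
Taken together, the rewritten entry point accepts score, trait, and publication queries and deduplicates the combined ID list before downloading. A minimal sketch of driving it from Python, mirroring the `sys.argv` patching pattern used in `conftest.py` (the IDs below are the examples from the README and the `--help` text, and are illustrative only):

```
# Hypothetical usage sketch, mirroring the patch('sys.argv', ...) pattern in conftest.py.
from unittest.mock import patch

from pgscatalog_utils.download.download_scorefile import download_scorefile

# Combine score, trait, and publication queries; -b requests harmonized files.
args = ['download_scorefiles',
        '-i', 'PGS000922',    # score ID from the README quickstart
        '-t', 'EFO_0004611',  # example EFO term from the --help text
        '-p', 'PGP000007',    # example PGP ID from the --help text
        '-b', 'GRCh38', '-o', 'scores/']

with patch('sys.argv', args):
    download_scorefile()  # writes e.g. scores/PGS000922_hmPOS_GRCh38.txt.gz
```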
22 changes: 22 additions & 0 deletions pgscatalog_utils/download/publication.py
@@ -0,0 +1,22 @@
import requests
import logging
from functools import reduce

logger = logging.getLogger(__name__)


def query_publication(pgp: str) -> list[str]:
api: str = f'https://www.pgscatalog.org/rest/publication/{pgp}'
logger.debug("Querying PGS Catalog with publication PGP ID")
r: requests.models.Response = requests.get(api)

if r.json() == {}:
logger.critical(f"Bad response from PGS Catalog for PGP ID: {pgp}")
raise Exception

pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids')
logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}")
return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values()))
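
For reference, a usage sketch of this helper on its own (PGP000007 is the example ID from the CLI help text; a live PGS Catalog API is assumed):

```
# Hypothetical usage sketch; assumes the PGS Catalog REST API is reachable.
from pgscatalog_utils.download.publication import query_publication

pgs_ids = query_publication('PGP000007')
# -> a deduplicated list of all PGS IDs associated with the publication
```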



57 changes: 57 additions & 0 deletions pgscatalog_utils/download/score.py
@@ -0,0 +1,57 @@
import requests
import logging
import jq
import sys

logger = logging.getLogger(__name__)


def get_url(pgs: list[str], build: str) -> dict[str, str]:
pgs_result: list[str] = []
url_result: list[str] = []

for chunk in _chunker(pgs):
try:
response = _parse_json_query(query_score(chunk), build)
pgs_result = pgs_result + list(response.keys())
url_result = url_result + list(response.values())
except TypeError:
logger.error(f"Bad response from PGS Catalog API. Is {pgs} a valid ID?")
sys.exit(1)

missing_pgs = set(pgs).difference(set(pgs_result))

if missing_pgs:
logger.warning(f"Some queries missing in PGS Catalog response: {missing_pgs}")

return dict(zip(pgs_result, url_result))


def query_score(pgs_id: list[str]) -> dict:
pgs: str = ','.join(pgs_id)
api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}'
r: requests.models.Response = requests.get(api)
return r.json()


def _chunker(pgs: list[str]):
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
return (pgs[pos: pos + size] for pos in range(0, len(pgs), size))


def _parse_json_query(json: dict, build: str | None) -> dict[str, str]:
result = jq.compile(".results").input(json).first()
if not result:
logger.warning("No results in response from PGS Catalog API. Please check the PGS IDs.")
else:
return _extract_ftp_url(json, build)


def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]:
id: list[str] = jq.compile('[.results][][].id').input(json).all()
if build is None:
result: list[str] = jq.compile('[.results][][].ftp_scoring_file').input(json).all()
else:
result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all()
return dict(zip(id, [x.replace('https', 'ftp') for x in result]))
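
A similar sketch for `get_url`, which chunks queries into batches of 50 IDs and warns about accessions missing from the response (PGS000922 is the accession from the README quickstart; network access assumed):

```
# Hypothetical usage sketch; assumes the PGS Catalog REST API is reachable.
from pgscatalog_utils.download.score import get_url

urls = get_url(['PGS000922'], 'GRCh37')
# -> maps each PGS ID to the FTP URL of its harmonized GRCh37 scoring file;
#    IDs are queried in chunks of 50, and any accessions missing from the
#    response are logged as a warning.
```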
23 changes: 23 additions & 0 deletions pgscatalog_utils/download/trait.py
@@ -0,0 +1,23 @@
import requests
import logging
from functools import reduce

logger = logging.getLogger(__name__)


def query_trait(trait: str) -> list[str]:
api: str = f'https://www.pgscatalog.org/rest/trait/{trait}?include_children=1'
logger.debug(f"Querying PGS Catalog with trait {trait}")
r: requests.models.Response = requests.get(api)

if r.json() == {}:
logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}")
raise Exception

keys: list[str] = ['associated_pgs_ids', 'child_associated_pgs_ids']
pgs: list[str] = []
for key in keys:
pgs.append(r.json().get(key))

logger.debug(f"Valid response from PGS Catalog for EFO term: {trait}")
return list(reduce(lambda x, y: set(x).union(set(y)), pgs))
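
And a matching sketch for the trait query (EFO_0004611 is the example term from the CLI help text):

```
# Hypothetical usage sketch; assumes the PGS Catalog REST API is reachable.
from pgscatalog_utils.download.trait import query_trait

pgs_ids = query_trait('EFO_0004611')
# -> union of 'associated_pgs_ids' and 'child_associated_pgs_ids'
#    for the trait, with duplicates removed
```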