Merge pull request #9 from PGScatalog/dev
Merge dev for release v0.1.1
smlmbrt authored Aug 23, 2022
2 parents a6de70c + 50999ac commit c1b5290
Showing 24 changed files with 809 additions and 331 deletions.
46 changes: 40 additions & 6 deletions README.md
@@ -20,16 +20,50 @@ of `combine_scorefile` to produce scoring files for plink 2
$ pip install pgscatalog-utils
```

Or clone the repo:

```
$ git clone https://github.com/PGScatalog/pgscatalog_utils.git
```

## Quickstart

```
$ download_scorefiles -i PGS000922 PGS001229 -o . -b GRCh37
$ combine_scorefiles -s PGS*.txt.gz -o combined.txt
$ match_variants -s combined.txt -t <example.pvar> --min_overlap 0.75 --outdir .
```

More details are available using the `--help` parameter.

## Install from source

Requirements:

- python 3.10
- [poetry](https://python-poetry.org)

```
$ git clone https://github.com/PGScatalog/pgscatalog_utils.git
$ cd pgscatalog_utils
$ poetry install
$ poetry build
$ pip install --user dist/*.whl
```

## Credits

The `pgscatalog_utils` package is developed as part of the **Polygenic Score (PGS) Catalog**
([www.PGSCatalog.org](https://www.PGSCatalog.org)) project, a collaboration between the
University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert, Laurent Gil)
and the European Bioinformatics Institute (Helen Parkinson, Aoife McMahon, Ben Wingfield, Laura Harris).

A manuscript describing the tool and the larger PGS Catalog Calculator pipeline
[(`PGSCatalog/pgsc_calc`)](https://github.com/PGScatalog/pgsc_calc) is in preparation. In the meantime,
if you use these tools, we ask you to cite the repo(s) and the paper describing the PGS Catalog resource:

- >PGS Catalog utilities _(in development)_. PGS Catalog
Team. [https://github.com/PGScatalog/pgscatalog_utils](https://github.com/PGScatalog/pgscatalog_utils)
- >PGS Catalog Calculator _(in development)_. PGS Catalog
Team. [https://github.com/PGScatalog/pgsc_calc](https://github.com/PGScatalog/pgsc_calc)
- >Lambert _et al._ (2021) The Polygenic Score Catalog as an open database for
reproducibility and systematic evaluation. Nature Genetics. 53:420–425
doi:[10.1038/s41588-021-00783-5](https://doi.org/10.1038/s41588-021-00783-5).

This work has received funding from EMBL-EBI core funds, the Baker Institute, the University of Cambridge,
Health Data Research UK (HDRUK), and the European Union's Horizon 2020 research and innovation programme
under grant agreement No 101016775 INTERVENE.
9 changes: 3 additions & 6 deletions conftest.py
@@ -6,6 +6,7 @@
from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
from pysqlar import SQLiteArchive
import pandas as pd
import glob


@pytest.fixture(scope="session")
@@ -21,11 +21,7 @@ def scorefiles(tmp_path_factory, pgs_accessions):
with patch('sys.argv', args):
download_scorefile()

paths: list[str] = [os.path.join(fn.resolve(), x + '.txt.gz') for x in pgs_accessions]

assert all([os.path.exists(x) for x in paths])

return paths
return glob.glob(os.path.join(fn.resolve(), "*.txt.gz"))


@pytest.fixture(scope="session")
@@ -117,7 +114,7 @@ def chain_files(db, tmp_path_factory)
def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory):
out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38',
'-m', '0.95'] + ['-o', str(out_path.resolve())]
'-m', '0.8'] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
2 changes: 1 addition & 1 deletion pgscatalog_utils/__init__.py
@@ -1 +1 @@
__version__ = '0.1.0'
__version__ = '0.1.1'
43 changes: 0 additions & 43 deletions pgscatalog_utils/download/api.py

This file was deleted.

112 changes: 90 additions & 22 deletions pgscatalog_utils/download/download_scorefile.py
@@ -1,44 +1,58 @@
import logging
import argparse
import logging
import os
import shutil
import textwrap
from contextlib import closing
from functools import reduce
from urllib import request as request
from pgscatalog_utils.download.api import pgscatalog_result

from pgscatalog_utils.download.publication import query_publication
from pgscatalog_utils.download.score import get_url
from pgscatalog_utils.download.trait import query_trait
from pgscatalog_utils.log_config import set_logging_level

logger = logging.getLogger(__name__)


def parse_args(args=None) -> argparse.Namespace:
parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files')
parser.add_argument('-i', '--id', nargs='+', dest='pgs',
help='<Required> PGS Catalog ID', required=True)
parser.add_argument('-b', '--build', dest='build', required=True,
help='<Required> Genome build: GRCh37 or GRCh38')
parser.add_argument('-o', '--outdir', dest='outdir', required=True,
default='scores/',
help='<Required> Output directory to store downloaded files')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='<Optional> Extra logging information')
return parser.parse_args(args)
def download_scorefile() -> None:
args = _parse_args()
set_logging_level(args.verbose)
_check_args(args)
_mkdir(args.outdir)

if args.build is None:
logger.critical('Downloading scoring file(s) in the author-reported genome build')
elif args.build in ['GRCh37', 'GRCh38']:
logger.critical(f'Downloading harmonized scoring file(s) in build: {args.build}.')
else:
logger.critical(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported')
raise Exception

def download_scorefile() -> None:
args = parse_args()
pgs_lst: list[list[str]] = []

set_logging_level(args.verbose)
if args.efo:
logger.debug("--trait set, querying traits")
pgs_lst = pgs_lst + [query_trait(x) for x in args.efo]

_mkdir(args.outdir)
if args.pgp:
logger.debug("--pgp set, querying publications")
pgs_lst = pgs_lst + [query_publication(x) for x in args.pgp]

if args.build not in ['GRCh37', 'GRCh38']:
raise Exception(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported')
if args.pgs:
logger.debug("--id set, querying scores")
pgs_lst.append(args.pgs) # pgs_lst: a list containing up to three flat lists

urls: dict[str, str] = pgscatalog_result(args.pgs, args.build)
pgs_id: list[str] = list(set(reduce(lambda x, y: x + y, pgs_lst)))

urls: dict[str, str] = get_url(pgs_id, args.build)

for pgsid, url in urls.items():
logger.debug(f"Downloading {pgsid} from {url}")
path: str = os.path.join(args.outdir, pgsid + '.txt.gz')
if args.build is None:
path: str = os.path.join(args.outdir, pgsid + '.txt.gz')
else:
path: str = os.path.join(args.outdir, pgsid + f'_hmPOS_{args.build}.txt.gz')
_download_ftp(url, path)


@@ -58,5 +72,59 @@ def _download_ftp(url: str, path: str) -> None:
shutil.copyfileobj(r, f)


def _check_args(args):
if not args.efo:
if not args.pgp:
if not args.pgs:
logger.critical("One of --trait, --pgp, or --id is required to download scorefiles")
raise Exception


def _description_text() -> str:
return textwrap.dedent('''\
Download a set of scoring files from the PGS Catalog using PGS
Scoring IDs, traits, or publication IDs.
The PGS Catalog API is queried to get a list of scoring file
URLs. Scoring files are downloaded via FTP to a specified
directory. PGS Catalog scoring files are staged with the name:
{PGS_ID}.txt.gz
If a valid build is specified, harmonized files are downloaded as:
{PGS_ID}_hmPOS_{genome_build}.txt.gz
These harmonized scoring files contain genomic coordinates,
remapped from author-submitted information such as rsIDs.
''')


def _epilog_text() -> str:
return textwrap.dedent('''\
download_scorefiles will skip downloading a scoring file if it
already exists in the download directory. This can be useful if
the download process is interrupted and needs to be restarted
later. You can track download progress with the verbose flag.
''')


def _parse_args(args=None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)')
parser.add_argument('-t', '--efo', dest='efo', nargs='+',
help='Traits described by EFO term(s) (e.g. EFO_0004611)')
parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+')
parser.add_argument('-b', '--build', dest='build',
help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')
parser.add_argument('-o', '--outdir', dest='outdir', required=True,
default='scores/',
help='<Required> Output directory to store downloaded files')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='<Optional> Extra logging information')
return parser.parse_args(args)


if __name__ == "__main__":
download_scorefile()
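
Taken together, the rewritten entry point accepts score, trait, and publication queries and deduplicates the combined ID list before downloading. A minimal sketch of driving it from Python, mirroring the `sys.argv` patching pattern used in `conftest.py` (the IDs below are the examples from the README and the `--help` text, and are illustrative only):

```
# Hypothetical usage sketch, mirroring the patch('sys.argv', ...) pattern in conftest.py.
from unittest.mock import patch

from pgscatalog_utils.download.download_scorefile import download_scorefile

# Combine score, trait, and publication queries; -b requests harmonized files.
args = ['download_scorefiles',
        '-i', 'PGS000922',    # score ID from the README quickstart
        '-t', 'EFO_0004611',  # example EFO term from the --help text
        '-p', 'PGP000007',    # example PGP ID from the --help text
        '-b', 'GRCh38', '-o', 'scores/']

with patch('sys.argv', args):
    download_scorefile()  # writes e.g. scores/PGS000922_hmPOS_GRCh38.txt.gz
```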
22 changes: 22 additions & 0 deletions pgscatalog_utils/download/publication.py
@@ -0,0 +1,22 @@
import requests
import logging
from functools import reduce

logger = logging.getLogger(__name__)


def query_publication(pgp: str) -> list[str]:
api: str = f'https://www.pgscatalog.org/rest/publication/{pgp}'
logger.debug("Querying PGS Catalog with publication PGP ID")
r: requests.models.Response = requests.get(api)

if r.json() == {}:
logger.critical(f"Bad response from PGS Catalog for PGP ID: {pgp}")
raise Exception

pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids')
logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}")
return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values()))
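
For reference, a usage sketch of this helper on its own (PGP000007 is the example ID from the CLI help text; a live PGS Catalog API is assumed):

```
# Hypothetical usage sketch; assumes the PGS Catalog REST API is reachable.
from pgscatalog_utils.download.publication import query_publication

pgs_ids = query_publication('PGP000007')
# -> a deduplicated list of all PGS IDs associated with the publication
```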



57 changes: 57 additions & 0 deletions pgscatalog_utils/download/score.py
@@ -0,0 +1,57 @@
import requests
import logging
import jq
import sys

logger = logging.getLogger(__name__)


def get_url(pgs: list[str], build: str) -> dict[str, str]:
pgs_result: list[str] = []
url_result: list[str] = []

for chunk in _chunker(pgs):
try:
response = _parse_json_query(query_score(chunk), build)
pgs_result = pgs_result + list(response.keys())
url_result = url_result + list(response.values())
except TypeError:
logger.error(f"Bad response from PGS Catalog API. Is {pgs} a valid ID?")
sys.exit(1)

missing_pgs = set(pgs).difference(set(pgs_result))

if missing_pgs:
logger.warning(f"Some queries missing in PGS Catalog response: {missing_pgs}")

return dict(zip(pgs_result, url_result))


def query_score(pgs_id: list[str]) -> dict:
pgs: str = ','.join(pgs_id)
api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}'
r: requests.models.Response = requests.get(api)
return r.json()


def _chunker(pgs: list[str]):
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
return (pgs[pos: pos + size] for pos in range(0, len(pgs), size))


def _parse_json_query(json: dict, build: str | None) -> dict[str, str]:
result = jq.compile(".results").input(json).first()
if not result:
logger.warning("No results in response from PGS Catalog API. Please check the PGS IDs.")
else:
return _extract_ftp_url(json, build)


def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]:
id: list[str] = jq.compile('[.results][][].id').input(json).all()
if build is None:
result: list[str] = jq.compile('[.results][][].ftp_scoring_file').input(json).all()
else:
result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all()
return dict(zip(id, [x.replace('https', 'ftp') for x in result]))
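
A similar sketch for `get_url`, which chunks queries into batches of 50 IDs and warns about accessions missing from the response (PGS000922 is the accession from the README quickstart; network access assumed):

```
# Hypothetical usage sketch; assumes the PGS Catalog REST API is reachable.
from pgscatalog_utils.download.score import get_url

urls = get_url(['PGS000922'], 'GRCh37')
# -> maps each PGS ID to the FTP URL of its harmonized GRCh37 scoring file;
#    IDs are queried in chunks of 50, and any accessions missing from the
#    response are logged as a warning.
```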
23 changes: 23 additions & 0 deletions pgscatalog_utils/download/trait.py
@@ -0,0 +1,23 @@
import requests
import logging
from functools import reduce

logger = logging.getLogger(__name__)


def query_trait(trait: str) -> list[str]:
api: str = f'https://www.pgscatalog.org/rest/trait/{trait}?include_children=1'
logger.debug(f"Querying PGS Catalog with trait {trait}")
r: requests.models.Response = requests.get(api)

if r.json() == {}:
logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}")
raise Exception

keys: list[str] = ['associated_pgs_ids', 'child_associated_pgs_ids']
pgs: list[str] = []
for key in keys:
pgs.append(r.json().get(key))

logger.debug(f"Valid response from PGS Catalog for EFO term: {trait}")
return list(reduce(lambda x, y: set(x).union(set(y)), pgs))
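
And a matching sketch for the trait query (EFO_0004611 is the example term from the CLI help text):

```
# Hypothetical usage sketch; assumes the PGS Catalog REST API is reachable.
from pgscatalog_utils.download.trait import query_trait

pgs_ids = query_trait('EFO_0004611')
# -> union of 'associated_pgs_ids' and 'child_associated_pgs_ids'
#    for the trait, with duplicates removed
```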