Commit

Merge pull request #17 from PGScatalog/dev
v0.1.2 release
smlmbrt authored Sep 15, 2022
2 parents c1b5290 + ce8b4be commit 130b103
Showing 36 changed files with 908 additions and 481 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -1,5 +1,5 @@
name: CI
-on: [push]
+on: [ push ]

jobs:
ci:
4 changes: 2 additions & 2 deletions Dockerfile
@@ -11,8 +11,8 @@ FROM python:3.10

WORKDIR /opt/

-COPY --from=builder /app/dist/pgscatalog_utils-0.1.1-py3-none-any.whl .
+COPY --from=builder /app/dist/pgscatalog_utils-0.1.2-py3-none-any.whl .

-RUN pip install pgscatalog_utils-0.1.1-py3-none-any.whl
+RUN pip install pgscatalog_utils-0.1.2-py3-none-any.whl

RUN apt-get update && apt-get install -y sqlite3
73 changes: 56 additions & 17 deletions conftest.py
@@ -1,12 +1,16 @@
-import pytest
-from unittest.mock import patch
-from pgscatalog_utils.download.download_scorefile import download_scorefile
+import glob
+import os
+from unittest.mock import patch
+
+import pandas as pd
+import polars as pl
+import pytest
import requests as req
-from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles
from pysqlar import SQLiteArchive
-import pandas as pd
-import glob

+from pgscatalog_utils.download.download_scorefile import download_scorefile
+from pgscatalog_utils.match.preprocess import complement_valid_alleles
+from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles


@pytest.fixture(scope="session")
@@ -66,7 +70,7 @@ def mini_score_path(tmp_path_factory):
def mini_scorefile(mini_score_path, tmp_path_factory):
# The mini scorefile overlaps well with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]
+    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
@@ -78,7 +82,7 @@ def mini_scorefile(mini_score_path, tmp_path_factory):
def combined_scorefile(scorefiles, tmp_path_factory):
# The combined scorefile overlaps poorly with cineca synthetic subset
out_path = tmp_path_factory.mktemp("scores") / "combined.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['-o', str(out_path.resolve())]
+    args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
@@ -111,10 +115,11 @@ def chain_files(db, tmp_path_factory):


@pytest.fixture(scope="session")
-def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory):
+def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory):
    out_path = tmp_path_factory.mktemp("scores") / "lifted.txt"
-    args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38',
-                       '-m', '0.8'] + ['-o', str(out_path.resolve())]
+    args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t',
+                       'GRCh38',
+                       '-m', '0.8'] + ['-o', str(out_path.resolve())]

with patch('sys.argv', args):
combine_scorefiles()
@@ -123,15 +128,11 @@ def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory):


@pytest.fixture(scope="session")
-def hg38_coords(tmp_path_factory):
-    out_path = tmp_path_factory.mktemp("dummy") / "hg38.txt"
+def hg38_coords():
    d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]}
    df = pd.DataFrame(d)
-    with open(out_path, 'w') as f:
-        f.write('#genome_build=GRCh38\n')
-    df.to_csv(out_path, mode='a', index=False)
-    df['filename'] = str(out_path.resolve())
    df['accession'] = 'dummy'
+    df['genome_build'] = 'GRCh38'
    return df


@@ -142,6 +143,44 @@ def hg19_coords(hg38_coords):
return pd.DataFrame(d)


+@pytest.fixture(scope='session')
+def small_flipped_scorefile(small_scorefile):
+    # simulate a scorefile on the wrong strand
+    return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele'])
+            .drop(['effect_allele', 'other_allele'])
+            .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'})
+            .pipe(complement_valid_alleles, ['effect_allele', 'other_allele']))
+
+
+@pytest.fixture(scope='session')
+def small_target():
+    return pl.DataFrame({"#CHROM": [1, 2, 3],
+                         "POS": [1, 2, 3],
+                         "REF": ["A", "T", "T"],
+                         "ALT": ["C", "A", "G"],
+                         "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"],
+                         "is_multiallelic": [False, False, False]})
+
+
+@pytest.fixture(scope='session')
+def small_scorefile():
+    df = pl.DataFrame({"accession": ["test", "test", "test"],
+                       "row_nr": [1, 2, 3],
+                       "chr_name": [1, 2, 3],
+                       "chr_position": [1, 2, 3],
+                       "effect_allele": ["A", "A", "G"],
+                       "other_allele": ["C", "T", "T"],
+                       "effect_weight": [1, 2, 3],
+                       "effect_type": ["additive", "additive", "additive"]})
+
+    return complement_valid_alleles(df, ["effect_allele", "other_allele"])
+
+
+@pytest.fixture(scope='session')
+def small_scorefile_no_oa(small_scorefile):
+    return small_scorefile.with_column(pl.lit(None).alias('other_allele'))
+
+
def _get_timeout(url):
try:
return req.get(url, timeout=5)
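
For illustration, a minimal sketch of tests consuming the new session fixtures. The *_FLIP column names are taken from the small_flipped_scorefile fixture above; the behaviour asserted for complement_valid_alleles is an assumption, not part of this diff:

def test_small_scorefile_has_flip_columns(small_scorefile):
    # complement_valid_alleles appears to add complemented *_FLIP columns
    # (small_flipped_scorefile renames them back over the originals)
    assert 'effect_allele_FLIP' in small_scorefile.columns
    assert 'other_allele_FLIP' in small_scorefile.columns

def test_no_oa_blanks_other_allele(small_scorefile_no_oa):
    # pl.lit(None) overwrites every other_allele value with null
    assert small_scorefile_no_oa['other_allele'].null_count() == small_scorefile_no_oa.height
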
2 changes: 1 addition & 1 deletion pgscatalog_utils/__init__.py
@@ -1 +1 @@
-__version__ = '0.1.1'
+__version__ = '0.1.2'
2 changes: 1 addition & 1 deletion pgscatalog_utils/download/download_scorefile.py
@@ -116,7 +116,7 @@ def _parse_args(args=None) -> argparse.Namespace:
parser.add_argument('-t', '--efo', dest='efo', nargs='+',
help='Traits described by an EFO term(s) (e.g. EFO_0004611)')
parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+')
-    parser.add_argument('-b', '--build', dest='build',
+    parser.add_argument('-b', '--build', dest='build', choices=['GRCh37', 'GRCh38'],
help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')
parser.add_argument('-o', '--outdir', dest='outdir', required=True,
default='scores/',
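
A sketch of driving the new --build validation, reusing the patch('sys.argv', ...) pattern from conftest.py; the -i flag for score IDs is an assumption, since this hunk only shows the -t/-p/-b/-o arguments (note this would download real files from the PGS Catalog):

from unittest.mock import patch
from pgscatalog_utils.download.download_scorefile import download_scorefile

args = ['download_scorefiles', '-i', 'PGS000822', '-b', 'GRCh38', '-o', 'scores/']
with patch('sys.argv', args):
    download_scorefile()  # an invalid -b value now fails fast inside argparse
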
6 changes: 2 additions & 4 deletions pgscatalog_utils/download/publication.py
@@ -1,7 +1,8 @@
-import requests
import logging
from functools import reduce
+
+import requests

logger = logging.getLogger(__name__)


@@ -17,6 +18,3 @@ def query_publication(pgp: str) -> list[str]:
pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids')
logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}")
return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values()))
-
-
-
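
The reduce() above unions associated score IDs across every value in the response dict; a toy illustration with made-up keys and IDs:

from functools import reduce

pgs = {'development': ['PGS000001', 'PGS000002'], 'evaluation': ['PGS000002', 'PGS000003']}
ids = list(reduce(lambda x, y: set(x).union(set(y)), pgs.values()))
# each ID appears once: {'PGS000001', 'PGS000002', 'PGS000003'} returned as a list
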
10 changes: 6 additions & 4 deletions pgscatalog_utils/download/score.py
@@ -1,8 +1,9 @@
-import requests
import logging
-import jq
import sys
+
+import jq
+import requests

logger = logging.getLogger(__name__)


@@ -36,7 +37,7 @@ def query_score(pgs_id: list[str]) -> dict:

def _chunker(pgs: list[str]):
size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs
-    return(pgs[pos: pos + size] for pos in range(0, len(pgs), size))
+    return (pgs[pos: pos + size] for pos in range(0, len(pgs), size))


def _parse_json_query(json: dict, build: str | None) -> dict[str, str]:
@@ -53,5 +54,6 @@ def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]:
        result: list[str] = jq.compile(f'[.results][][].ftp_scoring_file').input(
            json).all()
    else:
-        result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all()
+        result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(
+            json).all()
return dict(zip(id, [x.replace('https', 'ftp') for x in result]))
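
_chunker is a generator that pages IDs in batches of 50 to respect the REST endpoint limit; a quick worked example with dummy accessions:

from pgscatalog_utils.download.score import _chunker

ids = [f'PGS{i:06d}' for i in range(1, 121)]  # 120 dummy accessions
chunks = list(_chunker(ids))
assert [len(chunk) for chunk in chunks] == [50, 50, 20]
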
3 changes: 2 additions & 1 deletion pgscatalog_utils/download/trait.py
@@ -1,7 +1,8 @@
-import requests
import logging
from functools import reduce
+
+import requests

logger = logging.getLogger(__name__)


2 changes: 1 addition & 1 deletion pgscatalog_utils/log_config.py
@@ -12,4 +12,4 @@ def set_logging_level(verbose: bool):
    else:
        logging.basicConfig(level=logging.WARNING,
                            format=log_fmt,
-                            datefmt='%Y-%m-%d %H:%M:%S')
\ No newline at end of file
+                            datefmt='%Y-%m-%d %H:%M:%S')
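
For reference, how set_logging_level is typically called; the verbose branch isn't shown in this hunk, so the level it selects is an assumption:

from pgscatalog_utils.log_config import set_logging_level

set_logging_level(verbose=True)   # assumed to lower the threshold below WARNING
set_logging_level(verbose=False)  # WARNING, per the else branch above
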
55 changes: 55 additions & 0 deletions pgscatalog_utils/match/filter.py
@@ -0,0 +1,55 @@
+import logging
+
+import polars as pl
+
+logger = logging.getLogger(__name__)
+
+
+def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float,
+                  dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]:
+    """ Check overlap between filtered matches and scorefile, remove scores that don't match well and report stats """
+    filtered_matches: pl.DataFrame = _filter_matches(matches)
+    match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset)
+                               .with_columns(pl.col('best_match').fill_null(False)))
+
+    fail_rates: pl.DataFrame = _calculate_match_rate(match_log)
+
+    scores: list[pl.DataFrame] = []
+    for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()):
+        if rate < (1 - min_overlap):
+            df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [True], 'match_rate': [1 - rate]})
+            logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)")
+            scores.append(df.with_column(pl.col('accession').cast(pl.Categorical)))
+        else:
+            df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [False], 'match_rate': [1 - rate]})
+            logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)")
+            scores.append(df.with_column(pl.col('accession').cast(pl.Categorical)))
+
+    score_summary: pl.DataFrame = pl.concat(scores)
+    filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left')
+                                     .filter(pl.col('score_pass') == True))
+
+    return filtered_scores, score_summary
+
+
+def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame:
+    logger.debug("Calculating overlap between target genome and scoring file")
+    return (df.groupby('accession')
+            .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')])
+            .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')))
+
+
+def _filter_matches(df: pl.DataFrame) -> pl.DataFrame:
+    logger.debug("Filtering variants with exclude flag")
+    return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False))
+
+
+def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame:
+    return (scorefile.join(matches, on=['row_nr', 'accession'], how='left')
+            .with_column(pl.lit(dataset).alias('dataset'))
+            .select(pl.exclude("^.*_right$")))
+
+
+def _match_keys() -> list[str]:
+    return ['chr_name', 'chr_position', 'effect_allele', 'other_allele',
+            'accession', 'effect_type', 'effect_weight']
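
A worked example of the fail-rate arithmetic in _calculate_match_rate, mirroring the polars expressions above (the match_type values are invented, and groupby/with_column assume the polars version this file targets):

import polars as pl

matches = pl.DataFrame({'accession': ['PGS000001'] * 4,
                        'match_type': ['refalt', 'refalt', 'flip', None]})
rates = (matches.groupby('accession')
         .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')])
         .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')))
# fail_rate = 1/4 = 0.25, so 75% of variants matched:
# min_overlap = 0.7 passes (0.25 < 0.30); min_overlap = 0.8 fails (0.25 >= 0.20)
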
(Diff truncated: the remaining changed files are not shown on this page.)