From 44c487799e525706daf080062c90794303fc64f7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 8 Aug 2022 17:43:53 +0100 Subject: [PATCH 01/47] download scorefiles with multiple EFO and PGP terms --- pgscatalog_utils/download/api.py | 43 --------------- .../download/download_scorefile.py | 47 +++++++++++++--- pgscatalog_utils/download/publication.py | 22 ++++++++ pgscatalog_utils/download/score.py | 53 +++++++++++++++++++ pgscatalog_utils/download/trait.py | 23 ++++++++ tests/test_download.py | 16 +++++- 6 files changed, 151 insertions(+), 53 deletions(-) delete mode 100644 pgscatalog_utils/download/api.py create mode 100644 pgscatalog_utils/download/publication.py create mode 100644 pgscatalog_utils/download/score.py create mode 100644 pgscatalog_utils/download/trait.py diff --git a/pgscatalog_utils/download/api.py b/pgscatalog_utils/download/api.py deleted file mode 100644 index 8fdb1fe..0000000 --- a/pgscatalog_utils/download/api.py +++ /dev/null @@ -1,43 +0,0 @@ -import requests -import jq -import logging -import sys - -logger = logging.getLogger(__name__) - - -def pgscatalog_result(pgs: list[str], build: str) -> dict[str, str]: - result = _parse_json_query(_api_query(pgs), build) - - try: - if len(pgs) > len(result): - missing_pgs: set[str] = set(pgs).difference(set(result.keys())) - logger.warning(f"Some queries missing in PGS Catalog response: {missing_pgs}") - except TypeError: - logger.error(f"Bad response from PGS Catalog API. Is {pgs} a valid ID?") - sys.exit(1) - - return result - - -def _api_query(pgs_id: list[str]) -> dict: - pgs: str = ','.join(pgs_id) - api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}' - r: requests.models.Response = requests.get(api) - return r.json() - - -def _parse_json_query(json: dict, build: str) -> dict[str, str]: - result = jq.compile(".results").input(json).first() - if not result: - logger.warning("No results in response from PS Catalog API. Please check the PGS IDs.") - else: - return _extract_ftp_url(json, build) - - -def _extract_ftp_url(json: list[dict], build: str) -> dict[str, str]: - id: list[str] = jq.compile('[.results][][].id').input(json).all() - result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all() - return dict(zip(id, [x.replace('https', 'ftp') for x in result])) - - diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 74a79e1..7c88d21 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -3,8 +3,13 @@ import os import shutil from contextlib import closing +from functools import reduce from urllib import request as request -from pgscatalog_utils.download.api import pgscatalog_result +import sys + +from pgscatalog_utils.download.publication import query_publication +from pgscatalog_utils.download.score import get_url +from pgscatalog_utils.download.trait import query_trait from pgscatalog_utils.log_config import set_logging_level logger = logging.getLogger(__name__) @@ -12,29 +17,47 @@ def parse_args(args=None) -> argparse.Namespace: parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files') - parser.add_argument('-i', '--id', nargs='+', dest='pgs', - help=' PGS Catalog ID', required=True) + parser.add_argument('-i', '--id', nargs='+', dest='pgs', help='PGS Catalog ID') + parser.add_argument('-t', '--trait', dest='efo', nargs='+', + help='Traits described by an EFO term') + parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication IDs', nargs='+') parser.add_argument('-b', '--build', dest='build', required=True, help=' Genome build: GRCh37 or GRCh38') parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', help=' Output directory to store downloaded files') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', - help=' Extra logging information') + help='Extra logging information') return parser.parse_args(args) def download_scorefile() -> None: args = parse_args() - set_logging_level(args.verbose) - + _check_args(args) _mkdir(args.outdir) if args.build not in ['GRCh37', 'GRCh38']: - raise Exception(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported') + logger.critical(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported') + raise Exception + + pgs_lst: list[list[str]] = [] + + if args.efo: + logger.debug("--trait set, querying traits") + pgs_lst = pgs_lst + [query_trait(x) for x in args.efo] - urls: dict[str, str] = pgscatalog_result(args.pgs, args.build) + if args.pgp: + logger.debug("--pgp set, querying publications") + pgs_lst = pgs_lst + [query_publication(x) for x in args.pgp] + + if args.pgs: + logger.debug("--id set, querying scores") + pgs_lst.append(args.pgs) # pgs_lst: a list containing up to three flat lists + + pgs_id: list[str] = list(set(reduce(lambda x, y: x + y, pgs_lst))) + + urls: dict[str, str] = get_url(pgs_id, args.build) for pgsid, url in urls.items(): logger.debug(f"Downloading {pgsid} from {url}") @@ -58,5 +81,13 @@ def _download_ftp(url: str, path: str) -> None: shutil.copyfileobj(r, f) +def _check_args(args): + if not args.efo: + if not args.pgp: + if not args.pgs: + logger.critical("One of --trait, --pgp, or --id is required to download scorefiles") + raise Exception + + if __name__ == "__main__": download_scorefile() diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py new file mode 100644 index 0000000..b5e90fa --- /dev/null +++ b/pgscatalog_utils/download/publication.py @@ -0,0 +1,22 @@ +import requests +import logging +from functools import reduce + +logger = logging.getLogger(__name__) + + +def query_publication(pgp: str) -> list[str]: + api: str = f'https://www.pgscatalog.org/rest/publication/{pgp}' + logger.debug("Querying PGS Catalog with publication PGP ID") + r: requests.models.Response = requests.get(api) + + if r.json() == {}: + logger.critical(f"Bad response from PGS Catalog for EFO term: {pgp}") + raise Exception + + pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids') + logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}") + return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values())) + + + diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py new file mode 100644 index 0000000..c6049d6 --- /dev/null +++ b/pgscatalog_utils/download/score.py @@ -0,0 +1,53 @@ +import requests +import logging +import jq +import sys + +logger = logging.getLogger(__name__) + + +def get_url(pgs: list[str], build: str) -> dict[str, str]: + pgs_result: list[str] = [] + url_result: list[str] = [] + + for chunk in _chunker(pgs): + try: + response = _parse_json_query(_query_score(chunk), build) + pgs_result = pgs_result + list(response.keys()) + url_result = url_result + list(response.values()) + except TypeError: + logger.error(f"Bad response from PGS Catalog API. Is {pgs} a valid ID?") + sys.exit(1) + + missing_pgs = set(pgs).difference(set(pgs_result)) + + if missing_pgs: + logger.warning(f"Some queries missing in PGS Catalog response: {missing_pgs}") + + return dict(zip(pgs_result, url_result)) + + +def _chunker(pgs: list[str]): + size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs + return(pgs[pos: pos + size] for pos in range(0, len(pgs), size)) + + +def _query_score(pgs_id: list[str]) -> dict: + pgs: str = ','.join(pgs_id) + api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}' + r: requests.models.Response = requests.get(api) + return r.json() + + +def _parse_json_query(json: dict, build: str) -> dict[str, str]: + result = jq.compile(".results").input(json).first() + if not result: + logger.warning("No results in response from PGS Catalog API. Please check the PGS IDs.") + else: + return _extract_ftp_url(json, build) + + +def _extract_ftp_url(json: list[dict], build: str) -> dict[str, str]: + id: list[str] = jq.compile('[.results][][].id').input(json).all() + result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all() + return dict(zip(id, [x.replace('https', 'ftp') for x in result])) diff --git a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py new file mode 100644 index 0000000..981b40d --- /dev/null +++ b/pgscatalog_utils/download/trait.py @@ -0,0 +1,23 @@ +import requests +import logging +from functools import reduce + +logger = logging.getLogger(__name__) + + +def query_trait(trait: str) -> list[str]: + api: str = f'https://www.pgscatalog.org/rest/trait/{trait}?include_children=1' + logger.debug(f"Querying PGS Catalog with trait {trait}") + r: requests.models.Response = requests.get(api) + + if r.json() == {}: + logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}") + raise Exception + + keys: list[str] = ['associated_pgs_ids', 'child_associated_pgs_ids'] + pgs: list[str] = [] + for key in keys: + pgs.append(r.json().get(key)) + + logger.debug(f"Valid response from PGS Catalog for EFO term: {trait}") + return list(reduce(lambda x, y: set(x).union(set(y)), pgs)) diff --git a/tests/test_download.py b/tests/test_download.py index 78f1c83..0eaa210 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,13 +1,16 @@ import os import pytest from unittest.mock import patch -from pgscatalog_utils.download.api import pgscatalog_result + +from pgscatalog_utils.download.trait import query_trait +from pgscatalog_utils.download.publication import query_publication +from pgscatalog_utils.download.score import get_url from pgscatalog_utils.download.download_scorefile import download_scorefile @pytest.fixture(params=[["PGS000001"], ["PGS000001", "PGS000802"]]) def pgscatalog_api(request): - return pgscatalog_result(request.param, "GRCh37") + return get_url(request.param, "GRCh37") def test_pgscatalog_result(pgscatalog_api): @@ -29,3 +32,12 @@ def test_download_scorefile(tmp_path): download_scorefile() assert os.listdir(out_dir) == ['PGS000001.txt.gz'] + +def test_query_publication(): + # publications are relatively static + assert not set(query_publication("PGP000001")).difference(['PGS000001', 'PGS000002', 'PGS000003']) + + +def test_query_trait(): + # new scores may be added to traits in the future + assert {'PGS001901', 'PGS002115'}.issubset(set(query_trait("EFO_0004329"))) From b620c36ebc2d881543c3dcfdebac3ca2a14ca4b4 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 8 Aug 2022 17:50:36 +0100 Subject: [PATCH 02/47] match_variants per chromosome --- pgscatalog_utils/match/match_variants.py | 14 ++++++++++---- pgscatalog_utils/match/read.py | 8 +++++--- pgscatalog_utils/match/write.py | 10 ++++------ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 2d2a632..c9689a9 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -16,13 +16,19 @@ def match_variants(): set_logging_level(args.verbose) scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) - target: pl.DataFrame = read_target(path=args.target, n_threads=args.n_threads, - remove_multiallelic=args.remove_multiallelic) - dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator + match_lst: list[pl.DataFrame] = [] with pl.StringCache(): - matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) + for chrom in scorefile['chr_name'].unique().to_list(): + logger.debug(f'Matching chromosome {chrom} against target') + target: pl.DataFrame = read_target(path=args.target, chrom=chrom, n_threads=args.n_threads, + remove_multiallelic=args.remove_multiallelic) + matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) + match_lst.append(matches) + + matches: pl.DataFrame = pl.concat(match_lst) + dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) if matches.shape[0] == 0: # this can happen if args.min_overlap = 0 diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 138855d..bf2a677 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -7,10 +7,12 @@ logger = logging.getLogger(__name__) -def read_target(path: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: +def read_target(path: str, chrom: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) + df: pl.DataFrame = (pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) + .filter(pl.col('column_1') == chrom) + .collect()) df.columns = target.header match target.file_format: @@ -44,7 +46,7 @@ def _detect_target_format(path: str) -> Target: header: list[str] if "*" in path: - logger.debug("Wildcard detected in target path, guessing format from first match") + logger.debug("Detecting target file format") path = glob.glob(path)[0] # guess format from first file in directory with open(path, 'rt') as f: diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 45b74dc..df7e319 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -9,7 +9,7 @@ def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: logger.debug("Splitting by effect type") effect_types: dict[str, pl.DataFrame] = _split_effect_type(df) logger.debug("Deduplicating variants") - deduplicated: dict[str, pl.DataFrame] = {k: _deduplicate_variants(v) for k, v in effect_types.items()} + deduplicated: dict[str, pl.DataFrame] = {k: _deduplicate_variants(k, v) for k, v in effect_types.items()} ea_dict: dict[str, str] = {'is_dominant': 'dominant', 'is_recessive': 'recessive', 'additive': 'additive'} logger.debug("Writing out scorefiles") [_write_scorefile(ea_dict.get(k), v, split, outdir, dataset) for k, v in deduplicated.items()] @@ -21,8 +21,6 @@ def write_log(df: pl.DataFrame, dataset: str) -> None: def _write_scorefile(effect_type: str, scorefiles: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: """ Write a list of scorefiles with the same effect type """ - fout: str = '{dataset}_{chr}_{et}_{split}.scorefile' - # each list element contains a dataframe of variants # lists are split to ensure variants have unique ID - effect alleles for i, scorefile in enumerate(scorefiles): @@ -63,7 +61,7 @@ def _split_effect_type(df: pl.DataFrame) -> dict[str, pl.DataFrame]: return {x: df.filter(pl.col("effect_type") == x) for x in effect_types} -def _deduplicate_variants(df: pl.DataFrame) -> list[pl.DataFrame]: +def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFrame]: """ Find variant matches that have duplicate identifiers When merging a lot of scoring files, sometimes a variant might be duplicated this can happen when the effect allele differs at the same position, e.g.: @@ -101,9 +99,9 @@ def _deduplicate_variants(df: pl.DataFrame) -> list[pl.DataFrame]: df_lst.append(x) if len(df_lst) > 1: - logger.debug("Duplicate variant identifiers split") + logger.debug(f"Duplicate variant identifiers split for effect type {effect_type}") else: - logger.debug("No duplicate variant identifiers found") + logger.debug(f"No duplicate variant identifiers found for effect type {effect_type}") assert n_var == df.shape[0] From 6de8864c7004517a4543ec7f2fdf58e251b30817 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 9 Aug 2022 16:02:42 +0100 Subject: [PATCH 03/47] update dependencies --- poetry.lock | 21 +++++++++++++++------ pyproject.toml | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 28f0366..aa9f303 100644 --- a/poetry.lock +++ b/poetry.lock @@ -49,7 +49,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "coverage" -version = "6.4.2" +version = "6.4.3" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -134,19 +134,19 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.13.59" +version = "0.13.62" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] -connectorx = ["connectorx"] -numpy = ["numpy (>=1.16.0)"] fsspec = ["fsspec"] -pandas = ["pyarrow (>=4.0)", "pandas"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] +connectorx = ["connectorx"] +pandas = ["pyarrow (>=4.0)", "pandas"] pyarrow = ["pyarrow (>=4.0)"] +numpy = ["numpy (>=1.16.0)"] [[package]] name = "py" @@ -272,6 +272,14 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "typing-extensions" +version = "4.3.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "urllib3" version = "1.26.11" @@ -288,7 +296,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "31cffdaa5cb10864005af569ed7ab3142071abe6934d06789bac6a00ca2ba1ee" +content-hash = "ba8bce34f2ac27003140746ab76478f99a4a0176303fca52ea1c6548f7672f6b" [metadata.files] atomicwrites = [] @@ -364,4 +372,5 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +typing-extensions = [] urllib3 = [] diff --git a/pyproject.toml b/pyproject.toml index b3968f7..3441e00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" polars = "^0.13.59" +typing-extensions = "^4.3.0" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From fad43fda89508f1ab08a3f459fe1937da7180379 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 10 Aug 2022 11:32:25 +0100 Subject: [PATCH 04/47] add is_multiallelic to log --- pgscatalog_utils/match/match.py | 4 ++-- pgscatalog_utils/match/preprocess.py | 21 +++++++++++++++++---- pgscatalog_utils/match/read.py | 5 ++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index b0803de..7421a5f 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -92,7 +92,7 @@ def _post_match(df: pl.DataFrame, return df.with_columns([pl.col("*"), pl.col("effect_allele").alias(effect_allele), pl.col("other_allele").alias(other_allele), - pl.lit(match_type).alias("match_type") + pl.lit(match_type).alias("match_type"), ])[_matched_colnames()] @@ -132,4 +132,4 @@ def _target_keys(effect_allele: str, other_allele: str) -> list[str]: def _matched_colnames() -> list[str]: return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', - 'ID', 'REF', 'ALT', 'REF_FLIP', 'ALT_FLIP', 'match_type'] + 'ID', 'REF', 'ALT', 'REF_FLIP', 'ALT_FLIP', 'match_type', 'is_multiallelic'] diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index be64d7e..3c5ce73 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -31,13 +31,22 @@ def ugly_complement(df: pl.DataFrame) -> pl.DataFrame: ]) -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataFrame: - is_ma: pl.Series = df['ALT'].str.contains(',') # plink2 pvar multi-alleles are comma-separated - if is_ma.sum() > 0: +def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) -> pl.DataFrame: + # plink2 pvar multi-alleles are comma-separated + df: pl.DataFrame = (df.with_column( + pl.when(pl.col("ALT").str.contains(',')) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('is_multiallelic'))) + + if df['is_multiallelic'].sum() > 0: logger.debug("Multiallelic variants detected") if remove_multiallelic: + if not pvar: + logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " + "variant representations only") logger.debug('Dropping multiallelic variants') - return df[~is_ma] + return df[~df['is_multiallelic']] else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants @@ -53,3 +62,7 @@ def check_weights(df: pl.DataFrame) -> None: if any(weight_count > 1): logger.error("Multiple effect weights per variant per accession detected") raise Exception + + +def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: + df.with_column(pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) \ No newline at end of file diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index bf2a677..b920fb7 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -18,10 +18,11 @@ def read_target(path: str, chrom: str, n_threads: int, remove_multiallelic: bool match target.file_format: case 'bim': return (df[_default_cols()] + .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False) .pipe(ugly_complement)) case 'pvar': return (df[_default_cols()] - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) + .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True) .pipe(ugly_complement)) case _: logger.error("Invalid file format detected") @@ -81,5 +82,3 @@ def _pvar_header(path: str) -> list[str]: def _bim_header() -> list[str]: return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] - - From dc08ed966e6cb9ec4ee5b834ea498eed7518beec Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 10 Aug 2022 11:46:28 +0100 Subject: [PATCH 05/47] clean up --- README.md | 21 +++++++++++++------ .../download/download_scorefile.py | 2 +- pgscatalog_utils/match/match_variants.py | 2 +- poetry.lock | 11 +--------- pyproject.toml | 1 - 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 43f9f1b..5cfbd02 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,6 @@ of `combine_scorefile` to produce scoring files for plink 2 $ pip install pgscatalog-utils ``` -Or clone the repo: - -``` -$ git clone https://github.com/PGScatalog/pgscatalog_utils.git -``` - ## Quickstart ``` @@ -33,3 +27,18 @@ $ download_scorefiles -i PGS000922 PGS001229 -o . -b GRCh37 $ combine_scorefiles -s PGS*.txt.gz -o combined.txt $ match_variants -s combined.txt -t --min_overlap 0.75 --outdir . ``` + +## Install from source + +Requirements: + +- python 3.10 +- [poetry](https://python-poetry.org) + +``` +$ git clone https://github.com/PGScatalog/pgscatalog_utils.git +$ cd pgscatalog_utils +$ poetry install +$ poetry build +$ pip install --user dist/*.whl +``` \ No newline at end of file diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 7c88d21..3684f5a 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -16,7 +16,7 @@ def parse_args(args=None) -> argparse.Namespace: - parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files') + parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files from the PGS Catalog') parser.add_argument('-i', '--id', nargs='+', dest='pgs', help='PGS Catalog ID') parser.add_argument('-t', '--trait', dest='efo', nargs='+', help='Traits described by an EFO term') diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index c9689a9..6a71ac2 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -39,7 +39,7 @@ def match_variants(): def _parse_args(args=None): - parser = argparse.ArgumentParser(description='Read and format scoring files') + parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, help=' Label for target genomic dataset (e.g. "-d thousand_genomes")') parser.add_argument('-s', '--scorefiles', dest='scorefile', required=True, diff --git a/poetry.lock b/poetry.lock index aa9f303..e920a73 100644 --- a/poetry.lock +++ b/poetry.lock @@ -272,14 +272,6 @@ category = "dev" optional = false python-versions = ">=3.7" -[[package]] -name = "typing-extensions" -version = "4.3.0" -description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" -optional = false -python-versions = ">=3.7" - [[package]] name = "urllib3" version = "1.26.11" @@ -296,7 +288,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "ba8bce34f2ac27003140746ab76478f99a4a0176303fca52ea1c6548f7672f6b" +content-hash = "31cffdaa5cb10864005af569ed7ab3142071abe6934d06789bac6a00ca2ba1ee" [metadata.files] atomicwrites = [] @@ -372,5 +364,4 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -typing-extensions = [] urllib3 = [] diff --git a/pyproject.toml b/pyproject.toml index 3441e00..b3968f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" polars = "^0.13.59" -typing-extensions = "^4.3.0" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From 501775e52cd95ceda690a5172639ea00fa5532b7 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 10 Aug 2022 12:43:55 +0100 Subject: [PATCH 06/47] Make the output directory when it doesn't exist --- pgscatalog_utils/match/write.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index df7e319..50bbdba 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -6,6 +6,8 @@ def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: + if os.path.isdir(outdir) is False: + os.mkdir(outdir) logger.debug("Splitting by effect type") effect_types: dict[str, pl.DataFrame] = _split_effect_type(df) logger.debug("Deduplicating variants") From ceaf91ab9b69cc0957957411349d325fc79b75db Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 10 Aug 2022 12:45:09 +0100 Subject: [PATCH 07/47] =?UTF-8?q?Refactor=20the=20looping=20of=20scoring?= =?UTF-8?q?=20files=20and=20target=20variants:=20Using=20the=20chromosome?= =?UTF-8?q?=20causes=20it=20to=20fail=20if=20you=20list=20of=20pvar=20file?= =?UTF-8?q?s=20doesn=E2=80=99t=20work=20if=20there=20is=20a=20chromosome?= =?UTF-8?q?=20in=20the=20scoring=20file=20that=20it=E2=80=99s=20present=20?= =?UTF-8?q?in=20the=20target.=20I=20re-wrote=20it=20to=20loop=20through=20?= =?UTF-8?q?pvars=20instead.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pgscatalog_utils/match/match_variants.py | 16 ++++++++-------- pgscatalog_utils/match/read.py | 6 ++---- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index c9689a9..4dae163 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,6 +1,7 @@ import argparse import logging import polars as pl +from glob import glob from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.log_config import set_logging_level @@ -17,17 +18,16 @@ def match_variants(): scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) - match_lst: list[pl.DataFrame] = [] - with pl.StringCache(): - for chrom in scorefile['chr_name'].unique().to_list(): - logger.debug(f'Matching chromosome {chrom} against target') - target: pl.DataFrame = read_target(path=args.target, chrom=chrom, n_threads=args.n_threads, + for i, loc_target_current in enumerate(glob(args.target)): + logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') + target: pl.DataFrame = read_target(path=loc_target_current, n_threads=args.n_threads, remove_multiallelic=args.remove_multiallelic) - matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) - match_lst.append(matches) + if i == 0: + matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) + else: + matches: pl.DataFrame = pl.concat([matches, get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous)]) - matches: pl.DataFrame = pl.concat(match_lst) dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index bf2a677..9d9068b 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -7,12 +7,10 @@ logger = logging.getLogger(__name__) -def read_target(path: str, chrom: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: +def read_target(path: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string - df: pl.DataFrame = (pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) - .filter(pl.col('column_1') == chrom) - .collect()) + df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) df.columns = target.header match target.file_format: From 028ec19c542510952a767cc80c36701980b0a251 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 10 Aug 2022 13:10:27 +0100 Subject: [PATCH 08/47] Adapt download_scorefile to download both author-reported scoring files and harmonized files in builds GRCh37/38. Ensures scorefile naming matches the files on the FTP (e.x. PGS002237): - Build = None -> PGS002237.txt.gz - Build = GRCh37 -> PGS002237_hmPOS_GRCh37.txt.gz - Build = GRCh38 -> PGS002237_hmPOS_GRCh38.txt.gz --- .../download/download_scorefile.py | 23 ++++++++++++------- pgscatalog_utils/download/score.py | 10 +++++--- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 7c88d21..f5aeb1f 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -17,12 +17,12 @@ def parse_args(args=None) -> argparse.Namespace: parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files') - parser.add_argument('-i', '--id', nargs='+', dest='pgs', help='PGS Catalog ID') - parser.add_argument('-t', '--trait', dest='efo', nargs='+', - help='Traits described by an EFO term') - parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication IDs', nargs='+') - parser.add_argument('-b', '--build', dest='build', required=True, - help=' Genome build: GRCh37 or GRCh38') + parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)') + parser.add_argument('-t', '--efo', dest='efo', nargs='+', + help='Traits described by an EFO term(s) (e.g. EFO_0004611)') + parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+') + parser.add_argument('-b', '--build', dest='build', + help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38') parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', help=' Output directory to store downloaded files') @@ -37,7 +37,11 @@ def download_scorefile() -> None: _check_args(args) _mkdir(args.outdir) - if args.build not in ['GRCh37', 'GRCh38']: + if args.build is None: + logger.critical(f'Downloading scoring file(s) in the author-reported genome build') + elif args.build in ['GRCh37', 'GRCh38']: + logger.critical(f'Downloading harmonized scoring file(s) in build: {args.build}.') + else: logger.critical(f'Invalid genome build specified: {args.build}. Only -b GRCh37 and -b GRCh38 are supported') raise Exception @@ -61,7 +65,10 @@ def download_scorefile() -> None: for pgsid, url in urls.items(): logger.debug(f"Downloading {pgsid} from {url}") - path: str = os.path.join(args.outdir, pgsid + '.txt.gz') + if args.build is None: + path: str = os.path.join(args.outdir, pgsid + '.txt.gz') + else: + path: str = os.path.join(args.outdir, pgsid + f'_hmPOS_{args.build}.txt.gz') _download_ftp(url, path) diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index c6049d6..546cc6f 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -39,7 +39,7 @@ def _query_score(pgs_id: list[str]) -> dict: return r.json() -def _parse_json_query(json: dict, build: str) -> dict[str, str]: +def _parse_json_query(json: dict, build: str | None) -> dict[str, str]: result = jq.compile(".results").input(json).first() if not result: logger.warning("No results in response from PGS Catalog API. Please check the PGS IDs.") @@ -47,7 +47,11 @@ def _parse_json_query(json: dict, build: str) -> dict[str, str]: return _extract_ftp_url(json, build) -def _extract_ftp_url(json: list[dict], build: str) -> dict[str, str]: +def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]: id: list[str] = jq.compile('[.results][][].id').input(json).all() - result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all() + if build is None: + result: list[str] = jq.compile(f'[.results][][].ftp_scoring_file').input( + json).all() + else: + result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all() return dict(zip(id, [x.replace('https', 'ftp') for x in result])) From d62586cbbc1d0f988c2b78c95d4bf1eb0cbdbe82 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 10 Aug 2022 13:31:20 +0100 Subject: [PATCH 09/47] Update tests --- tests/test_download.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_download.py b/tests/test_download.py index 0eaa210..611740e 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -24,14 +24,22 @@ def test_pgscatalog_result(pgscatalog_api): assert v.endswith(".txt.gz") -def test_download_scorefile(tmp_path): +def test_download_scorefile_author(tmp_path): out_dir = str(tmp_path.resolve()) - args: list[str] = ['download_scorefiles', '-i', 'PGS000001', '-b', 'GRCh38', '-o', out_dir] + args: list[str] = ['download_scorefiles', '-i', 'PGS000001', '-o', out_dir] with patch('sys.argv', args): download_scorefile() assert os.listdir(out_dir) == ['PGS000001.txt.gz'] +def test_download_scorefile_hmPOS(tmp_path): + out_dir = str(tmp_path.resolve()) + args: list[str] = ['download_scorefiles', '-i', 'PGS000001', '-b', 'GRCh38', '-o', out_dir] + + with patch('sys.argv', args): + download_scorefile() + assert os.listdir(out_dir) == ['PGS000001_hmPOS_GRCh38.txt.gz'] + def test_query_publication(): # publications are relatively static From 26d92525b1949f700372ac9f4b9c720829ed24ee Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 10 Aug 2022 13:37:58 +0100 Subject: [PATCH 10/47] fix tests --- conftest.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index 08b0f9b..836853b 100644 --- a/conftest.py +++ b/conftest.py @@ -6,6 +6,7 @@ from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles from pysqlar import SQLiteArchive import pandas as pd +import glob @pytest.fixture(scope="session") @@ -21,11 +22,7 @@ def scorefiles(tmp_path_factory, pgs_accessions): with patch('sys.argv', args): download_scorefile() - paths: list[str] = [os.path.join(fn.resolve(), x + '.txt.gz') for x in pgs_accessions] - - assert all([os.path.exists(x) for x in paths]) - - return paths + return glob.glob(os.path.join(fn.resolve(), "*.txt.gz")) @pytest.fixture(scope="session") From 2c4aaf93bf3312b7ab4a7d41ec574056615b2bf5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 10 Aug 2022 13:40:15 +0100 Subject: [PATCH 11/47] remove redundant database write --- pgscatalog_utils/scorefile/write.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index a2f04a8..9204096 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -22,7 +22,6 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: logger.warning("No other allele information detected, writing out as missing data") out_df['other_allele'] = None - _write_log(out_df) out_df[cols].to_csv(path, index=False, sep="\t") @@ -33,20 +32,3 @@ def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: else: return df - -def _write_log(df: pd.DataFrame) -> None: - logger.debug("Writing log to local database") - conn: sqlite3.Connection = sqlite3.connect('scorefiles.db') - - if 'liftover' not in df: - df = df.assign(liftover=None, lifted_chr=None, lifted_pos=None) - - cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'accession', 'liftover', 'lifted_chr', 'lifted_pos'] - - # change some column types for sqlite - # nullable_ints: list[str] = ['liftover', 'lifted_chr', 'lifted_pos'] - # df[nullable_ints] = df[nullable_ints].astype(pd.Int64Dtype()) - df['other_allele'] = df['other_allele'].astype(str) - df[cols].to_sql('scorefile', conn, if_exists='replace') - conn.close() From 1141f666e2d700d038fb40061ecb0e98a1bb9627 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 11 Aug 2022 16:42:55 +0100 Subject: [PATCH 12/47] check split target chromosomes only contain one chromosome --- pgscatalog_utils/match/match_variants.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index d238848..9669ff8 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -38,6 +38,11 @@ def match_variants(): write_out(matches, args.split, args.outdir, dataset) +def _check_target_chroms(target) -> int: + n_chrom: int = len(target['#CHROM'].unique().to_list()) + if n_chrom > 1: + logger.critical(f"Multiple chromosomes detected in split file") + raise Exception def _parse_args(args=None): parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, From feb4fd362e38d4a84f0b7c116471be504950021f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 11 Aug 2022 16:43:23 +0100 Subject: [PATCH 13/47] match single target genome variant files iteratively --- pgscatalog_utils/match/match.py | 5 ++- pgscatalog_utils/match/match_variants.py | 49 +++++++++++++++++------- pgscatalog_utils/match/read.py | 21 +++++++++- 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7421a5f..f50027b 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,12 +1,13 @@ import polars as pl import logging +from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -37,7 +38,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame) -> pl.DataFra matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='ALT_FLIP', other_allele=None, match_type="no_oa_alt_flip")) - return pl.concat(matches) + return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> None: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 9669ff8..1012711 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -2,31 +2,32 @@ import logging import polars as pl from glob import glob -from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.log_config import set_logging_level from pgscatalog_utils.match.match import get_all_matches, check_match_rate from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out +logger = logging.getLogger(__name__) + def match_variants(): args = _parse_args() - logger = logging.getLogger(__name__) set_logging_level(args.verbose) scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): - for i, loc_target_current in enumerate(glob(args.target)): - logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, n_threads=args.n_threads, - remove_multiallelic=args.remove_multiallelic) - if i == 0: - matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) - else: - matches: pl.DataFrame = pl.concat([matches, get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous)]) + n_target_files = len(glob(args.target)) + matches: list[pl.DataFrame] = [] + + if n_target_files == 1: + logger.debug("Single target variant file detected") + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous) + else: + logger.debug("Multiple target variant files detected") + matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous) dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) @@ -38,11 +39,36 @@ def match_variants(): write_out(matches, args.split, args.outdir, dataset) +def _match_multiple_targets(target_path, scorefile, remove_multiallelic, remove_ambiguous): + matches = [] + for i, loc_target_current in enumerate(glob(target_path)): + logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') + target: pl.DataFrame = read_target(path=loc_target_current, + remove_multiallelic=remove_multiallelic) + _check_target_chroms(target) + matches.append(get_all_matches(scorefile, target, remove_ambiguous)) + return pl.concat(matches) + + def _check_target_chroms(target) -> int: n_chrom: int = len(target['#CHROM'].unique().to_list()) if n_chrom > 1: logger.critical(f"Multiple chromosomes detected in split file") raise Exception + + +def _match_single_target(target_path, scorefile, remove_multiallelic, remove_ambiguous): + matches = [] + for chrom in scorefile['chr_name'].unique().to_list(): + target = read_target(target_path, remove_multiallelic=remove_multiallelic, + singie_file=True, chrom=chrom) + if target: + logger.debug(f"Matching chromosome {chrom}") + matches.append(get_all_matches(scorefile, target, remove_ambiguous)) + + return pl.concat(matches) + + def _parse_args(args=None): parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, @@ -55,8 +81,6 @@ def _parse_args(args=None): help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') - parser.add_argument('-n', '--n_threads', dest='n_threads', default=1, type=int, - help=' Number of threads used to match (default = 1)') parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', @@ -74,7 +98,6 @@ def _parse_args(args=None): if __name__ == "__main__": match_variants() - # join matches and scorefile with keys depending on liftover # count match type column # matches.groupby('accession').agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 6735456..906870e 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -7,10 +7,27 @@ logger = logging.getLogger(__name__) -def read_target(path: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: +def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, + chrom: str = "") -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) + logger.debug(f"Reading target {path}") + + if singie_file: + logger.debug(f"Scanning target genome for chromosome {chrom}") + df: pl.DataFrame = ( + pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, + low_memory=True) + .filter(pl.col('column_1') == chrom) + .collect()) + + if df.is_empty(): + logger.warning(f"Chromosome missing from target genome: {chrom}") + return df + else: + df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, + low_memory=True) + df.columns = target.header match target.file_format: From bff1d09a221856a9e431ae6bdf12eb399cfd9340 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 11 Aug 2022 16:50:39 +0100 Subject: [PATCH 14/47] tidy up --- pgscatalog_utils/match/match_variants.py | 22 ++++++++++++++-------- pgscatalog_utils/match/read.py | 13 +++++++------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 1012711..1bb1bcb 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,8 +1,9 @@ import argparse import logging -import polars as pl from glob import glob +import polars as pl + from pgscatalog_utils.log_config import set_logging_level from pgscatalog_utils.match.match import get_all_matches, check_match_rate from pgscatalog_utils.match.read import read_target, read_scorefile @@ -16,11 +17,12 @@ def match_variants(): set_logging_level(args.verbose) + logger.debug(f"n_threads: {pl.threadpool_size()}") scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): n_target_files = len(glob(args.target)) - matches: list[pl.DataFrame] = [] + matches: pl.DataFrame if n_target_files == 1: logger.debug("Single target variant file detected") @@ -39,29 +41,33 @@ def match_variants(): write_out(matches, args.split, args.outdir, dataset) -def _match_multiple_targets(target_path, scorefile, remove_multiallelic, remove_ambiguous): +def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, + remove_ambiguous: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, - remove_multiallelic=remove_multiallelic) + remove_multiallelic=remove_multiallelic) # _check_target_chroms(target) matches.append(get_all_matches(scorefile, target, remove_ambiguous)) return pl.concat(matches) -def _check_target_chroms(target) -> int: +def _check_target_chroms(target) -> None: n_chrom: int = len(target['#CHROM'].unique().to_list()) if n_chrom > 1: - logger.critical(f"Multiple chromosomes detected in split file") + logger.critical("Multiple chromosomes detected in split file. Check input data.") raise Exception + else: + logger.debug("Split target genome contains one chromosome (good)") -def _match_single_target(target_path, scorefile, remove_multiallelic, remove_ambiguous): +def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, + remove_ambiguous: bool) -> pl.DataFrame: matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, - singie_file=True, chrom=chrom) + singie_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") matches.append(get_all_matches(scorefile, target, remove_ambiguous)) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 906870e..8dc0d53 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,7 +1,9 @@ -import polars as pl -import logging import glob +import logging from typing import NamedTuple + +import polars as pl + from pgscatalog_utils.match.preprocess import ugly_complement, handle_multiallelic, check_weights logger = logging.getLogger(__name__) @@ -15,9 +17,9 @@ def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, if singie_file: logger.debug(f"Scanning target genome for chromosome {chrom}") + # scan target and filter to reduce memory usage on big files df: pl.DataFrame = ( - pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, - low_memory=True) + pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) .filter(pl.col('column_1') == chrom) .collect()) @@ -25,8 +27,7 @@ def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, logger.warning(f"Chromosome missing from target genome: {chrom}") return df else: - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, - low_memory=True) + df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) df.columns = target.header From fdd9b79d370701904c7050ac11a6ce357cf093b7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 12 Aug 2022 09:44:57 +0100 Subject: [PATCH 15/47] revert iterative matching on a single file --- pgscatalog_utils/match/match.py | 5 +- pgscatalog_utils/match/match_variants.py | 61 +++++++----------------- pgscatalog_utils/match/read.py | 26 ++-------- 3 files changed, 22 insertions(+), 70 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index f50027b..7421a5f 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,13 +1,12 @@ import polars as pl import logging -from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -38,7 +37,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='ALT_FLIP', other_allele=None, match_type="no_oa_alt_flip")) - return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) + return pl.concat(matches) def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> None: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 1bb1bcb..9669ff8 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,35 +1,32 @@ import argparse import logging -from glob import glob - import polars as pl +from glob import glob +from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.log_config import set_logging_level from pgscatalog_utils.match.match import get_all_matches, check_match_rate from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out -logger = logging.getLogger(__name__) - def match_variants(): args = _parse_args() + logger = logging.getLogger(__name__) set_logging_level(args.verbose) - logger.debug(f"n_threads: {pl.threadpool_size()}") scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): - n_target_files = len(glob(args.target)) - matches: pl.DataFrame - - if n_target_files == 1: - logger.debug("Single target variant file detected") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous) - else: - logger.debug("Multiple target variant files detected") - matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous) + for i, loc_target_current in enumerate(glob(args.target)): + logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') + target: pl.DataFrame = read_target(path=loc_target_current, n_threads=args.n_threads, + remove_multiallelic=args.remove_multiallelic) + if i == 0: + matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) + else: + matches: pl.DataFrame = pl.concat([matches, get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous)]) dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) @@ -41,40 +38,11 @@ def match_variants(): write_out(matches, args.split, args.outdir, dataset) -def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool) -> pl.DataFrame: - matches = [] - for i, loc_target_current in enumerate(glob(target_path)): - logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, - remove_multiallelic=remove_multiallelic) # - _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, remove_ambiguous)) - return pl.concat(matches) - - -def _check_target_chroms(target) -> None: +def _check_target_chroms(target) -> int: n_chrom: int = len(target['#CHROM'].unique().to_list()) if n_chrom > 1: - logger.critical("Multiple chromosomes detected in split file. Check input data.") + logger.critical(f"Multiple chromosomes detected in split file") raise Exception - else: - logger.debug("Split target genome contains one chromosome (good)") - - -def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool) -> pl.DataFrame: - matches = [] - for chrom in scorefile['chr_name'].unique().to_list(): - target = read_target(target_path, remove_multiallelic=remove_multiallelic, - singie_file=True, chrom=chrom) # scans and filters - if target: - logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, remove_ambiguous)) - - return pl.concat(matches) - - def _parse_args(args=None): parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, @@ -87,6 +55,8 @@ def _parse_args(args=None): help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') + parser.add_argument('-n', '--n_threads', dest='n_threads', default=1, type=int, + help=' Number of threads used to match (default = 1)') parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', @@ -104,6 +74,7 @@ def _parse_args(args=None): if __name__ == "__main__": match_variants() + # join matches and scorefile with keys depending on liftover # count match type column # matches.groupby('accession').agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 8dc0d53..6735456 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,34 +1,16 @@ -import glob +import polars as pl import logging +import glob from typing import NamedTuple - -import polars as pl - from pgscatalog_utils.match.preprocess import ugly_complement, handle_multiallelic, check_weights logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, - chrom: str = "") -> pl.DataFrame: +def read_target(path: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string - logger.debug(f"Reading target {path}") - - if singie_file: - logger.debug(f"Scanning target genome for chromosome {chrom}") - # scan target and filter to reduce memory usage on big files - df: pl.DataFrame = ( - pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) - .filter(pl.col('column_1') == chrom) - .collect()) - - if df.is_empty(): - logger.warning(f"Chromosome missing from target genome: {chrom}") - return df - else: - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) - + df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) df.columns = target.header match target.file_format: From 96da74dce2dce817e4da87c785e06fbff96f7d0b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 12 Aug 2022 09:56:17 +0100 Subject: [PATCH 16/47] fix checking chromosomes --- pgscatalog_utils/match/match.py | 5 +++-- pgscatalog_utils/match/match_variants.py | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7421a5f..f50027b 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,12 +1,13 @@ import polars as pl import logging +from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -37,7 +38,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame) -> pl.DataFra matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='ALT_FLIP', other_allele=None, match_type="no_oa_alt_flip")) - return pl.concat(matches) + return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> None: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 9669ff8..9d72d12 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -10,24 +10,29 @@ from pgscatalog_utils.match.write import write_out +logger = logging.getLogger(__name__) + + def match_variants(): args = _parse_args() - logger = logging.getLogger(__name__) set_logging_level(args.verbose) scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): + match_lst = [] + for i, loc_target_current in enumerate(glob(args.target)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, n_threads=args.n_threads, remove_multiallelic=args.remove_multiallelic) - if i == 0: - matches: pl.DataFrame = get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous) - else: - matches: pl.DataFrame = pl.concat([matches, get_all_matches(scorefile, target).pipe(postprocess_matches, args.remove_ambiguous)]) + match_lst.append(get_all_matches(scorefile, target, args.remove_ambiguous)) + if len(glob(args.target)) > 1: + _check_target_chroms(target) + + matches: pl.DataFrame = pl.concat(match_lst) dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) @@ -43,6 +48,8 @@ def _check_target_chroms(target) -> int: if n_chrom > 1: logger.critical(f"Multiple chromosomes detected in split file") raise Exception + + def _parse_args(args=None): parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, From 9a832ff968be42dbc1d2b4e06d6296fc14040099 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 12 Aug 2022 12:52:40 +0100 Subject: [PATCH 17/47] Update matching modes --- pgscatalog_utils/match/match_variants.py | 97 ++++++++++++++++++------ pgscatalog_utils/match/read.py | 26 ++++++- 2 files changed, 96 insertions(+), 27 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 9d72d12..4af50ac 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,15 +1,14 @@ import argparse import logging -import polars as pl from glob import glob -from pgscatalog_utils.match.postprocess import postprocess_matches + +import polars as pl from pgscatalog_utils.log_config import set_logging_level from pgscatalog_utils.match.match import get_all_matches, check_match_rate from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out - logger = logging.getLogger(__name__) @@ -18,21 +17,39 @@ def match_variants(): set_logging_level(args.verbose) + logger.debug(f"n_threads: {pl.threadpool_size()}") scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): - match_lst = [] + n_target_files = len(glob(args.target)) + matches: pl.DataFrame + + if n_target_files == 1 and not args.fast: + match_mode: str = 'single' + elif n_target_files > 1 and not args.fast: + match_mode: str = 'multi' + elif args.fast: + match_mode: str = 'fast' + + match match_mode: + case "single": + logger.debug(f"Match mode: {match_mode}") + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous) + case "multi": + logger.debug(f"Match mode: {match_mode}") + matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, + args.remove_ambiguous) + case "fast": + logger.debug(f"Match mode: {match_mode}") + check_chrom: bool = False + if n_target_files > 1: + check_chrom = True + matches = _fast_match(args.target, scorefile, args.remove_multiallelic, + args.remove_ambiguous, check_chrom) + case _: + logger.critical(f"Invalid match mode: {match_mode}") + raise Exception - for i, loc_target_current in enumerate(glob(args.target)): - logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, n_threads=args.n_threads, - remove_multiallelic=args.remove_multiallelic) - match_lst.append(get_all_matches(scorefile, target, args.remove_ambiguous)) - - if len(glob(args.target)) > 1: - _check_target_chroms(target) - - matches: pl.DataFrame = pl.concat(match_lst) dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) @@ -43,11 +60,50 @@ def match_variants(): write_out(matches, args.split, args.outdir, dataset) -def _check_target_chroms(target) -> int: +def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, + remove_ambiguous: bool, check_chrom: bool) -> pl.DataFrame: + # fast match is fast because: + # 1) all target files are read into memory + # 2) matching occurs without iterating through chromosomes + target: pl.DataFrame = read_target(path=target_path, + remove_multiallelic=remove_multiallelic) + if check_chrom: + _check_target_chroms(target) + return get_all_matches(scorefile, target, remove_ambiguous) + + +def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, + remove_ambiguous: bool) -> pl.DataFrame: + matches = [] + for i, loc_target_current in enumerate(glob(target_path)): + logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') + target: pl.DataFrame = read_target(path=loc_target_current, + remove_multiallelic=remove_multiallelic) # + _check_target_chroms(target) + matches.append(get_all_matches(scorefile, target, remove_ambiguous)) + return pl.concat(matches) + + +def _check_target_chroms(target) -> None: n_chrom: int = len(target['#CHROM'].unique().to_list()) if n_chrom > 1: - logger.critical(f"Multiple chromosomes detected in split file") + logger.critical("Multiple chromosomes detected in split file. Check input data.") raise Exception + else: + logger.debug("Split target genome contains one chromosome (good)") + + +def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, + remove_ambiguous: bool) -> pl.DataFrame: + matches = [] + for chrom in scorefile['chr_name'].unique().to_list(): + target = read_target(target_path, remove_multiallelic=remove_multiallelic, + singie_file=True, chrom=chrom) # scans and filters + if target: + logger.debug(f"Matching chromosome {chrom}") + matches.append(get_all_matches(scorefile, target, remove_ambiguous)) + + return pl.concat(matches) def _parse_args(args=None): @@ -58,12 +114,12 @@ def _parse_args(args=None): help=' Combined scorefile path (output of read_scorefiles.py)') parser.add_argument('-t', '--target', dest='target', required=True, help=' A table of target genomic variants (.bim format)') + parser.add_argument('-f', '--fast', dest='fast', action='store_true', + help=' Enable faster matching at the cost of increased RAM usage') parser.add_argument('--split', dest='split', default=False, action='store_true', help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') - parser.add_argument('-n', '--n_threads', dest='n_threads', default=1, type=int, - help=' Number of threads used to match (default = 1)') parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', @@ -80,8 +136,3 @@ def _parse_args(args=None): if __name__ == "__main__": match_variants() - - -# join matches and scorefile with keys depending on liftover -# count match type column -# matches.groupby('accession').agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 6735456..8dc0d53 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,16 +1,34 @@ -import polars as pl -import logging import glob +import logging from typing import NamedTuple + +import polars as pl + from pgscatalog_utils.match.preprocess import ugly_complement, handle_multiallelic, check_weights logger = logging.getLogger(__name__) -def read_target(path: str, n_threads: int, remove_multiallelic: bool) -> pl.DataFrame: +def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, + chrom: str = "") -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d, n_threads=n_threads) + logger.debug(f"Reading target {path}") + + if singie_file: + logger.debug(f"Scanning target genome for chromosome {chrom}") + # scan target and filter to reduce memory usage on big files + df: pl.DataFrame = ( + pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) + .filter(pl.col('column_1') == chrom) + .collect()) + + if df.is_empty(): + logger.warning(f"Chromosome missing from target genome: {chrom}") + return df + else: + df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) + df.columns = target.header match target.file_format: From f5e1c9ed9a998066846bcd51ed0cf448b10b197d Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 15 Aug 2022 16:17:26 +0100 Subject: [PATCH 18/47] Change default behaviour of combine_scorefiles to not drop variants with missing chromosomal positions or non-standard alleles (e.g. HLA) --- pgscatalog_utils/scorefile/combine_scorefiles.py | 9 ++++++--- pgscatalog_utils/scorefile/qc.py | 14 +++++++++----- pgscatalog_utils/scorefile/read.py | 4 ++-- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 5ec21d4..02d0ca9 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -24,6 +24,9 @@ def parse_args(args=None) -> argparse.Namespace: parser.add_argument('-m', '--min_lift', dest='min_lift', help='If liftover, minimum proportion of variants lifted over', required="--liftover" in sys.argv, default=0.95, type=float) + parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', + help='Drop variants with missing information (chr/pos) and ' + 'non-standard alleles from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, default='combined.txt', help=' Output path to combined long scorefile') @@ -40,7 +43,7 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") - scorefiles: pd.DataFrame = pd.concat([_read_and_melt(x) for x in paths]) + scorefiles: pd.DataFrame = pd.concat([_read_and_melt(x, drop_missing=args.drop_missing) for x in paths]) if args.liftover: logger.debug("Annotating scorefiles with liftover parameters") @@ -49,9 +52,9 @@ def combine_scorefiles(): write_scorefile(scorefiles, args.outfile) -def _read_and_melt(path): +def _read_and_melt(path, drop_missing: bool = False): """ Load a scorefile, melt it, and set the effect types""" - return (load_scorefile(path) + return (load_scorefile(path, drop_missing=drop_missing) .pipe(melt_effect_weights) .pipe(set_effect_type)) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index fe2b725..4316f1e 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -4,15 +4,19 @@ logger = logging.getLogger(__name__) -def quality_control(df: pd.DataFrame) -> pd.DataFrame: +def quality_control(df: pd.DataFrame, drop_missing: bool) -> pd.DataFrame: """ Do quality control checks on a scorefile """ _check_shape(df) _check_columns(df) logger.debug("Quality control: checking for bad variants") - return (df.pipe(_drop_hla) - .pipe(_drop_missing_variants) - .pipe(_check_duplicate_identifiers) - .pipe(_drop_multiple_oa)) + if drop_missing is True: + return (df.pipe(_drop_hla) + .pipe(_drop_missing_variants) + .pipe(_check_duplicate_identifiers) + .pipe(_drop_multiple_oa)) + else: + return (df.pipe(_check_duplicate_identifiers) + .pipe(_drop_multiple_oa)) def _drop_multiple_oa(df: pd.DataFrame) -> pd.DataFrame: diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index 43fe176..7674c7c 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -7,13 +7,13 @@ logger = logging.getLogger(__name__) -def load_scorefile(path: str, use_harmonised: bool = True) -> pd.DataFrame: +def load_scorefile(path: str, use_harmonised: bool = True, drop_missing: bool = False) -> pd.DataFrame: logger.debug(f'Reading scorefile {path}') return (pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) .pipe(remap_harmonised, use_harmonised=use_harmonised) .assign(filename_prefix=_get_basename(path), filename=path) - .pipe(quality_control)) + .pipe(quality_control, drop_missing=drop_missing)) def _scorefile_dtypes() -> dict[str]: From 137596c110fcd8b806adf5afa17539e53b414466 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 15 Aug 2022 16:25:04 +0100 Subject: [PATCH 19/47] Fix typo --- pgscatalog_utils/match/match_variants.py | 2 +- pgscatalog_utils/match/read.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 4af50ac..bc8a4cd 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -98,7 +98,7 @@ def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multi matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, - singie_file=True, chrom=chrom) # scans and filters + single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") matches.append(get_all_matches(scorefile, target, remove_ambiguous)) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 8dc0d53..025180a 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -9,13 +9,13 @@ logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, +def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, chrom: str = "") -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string logger.debug(f"Reading target {path}") - if singie_file: + if single_file: logger.debug(f"Scanning target genome for chromosome {chrom}") # scan target and filter to reduce memory usage on big files df: pl.DataFrame = ( From f4626f276ca5fdacf24b917e722e8ba1d32f5d19 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 15 Aug 2022 18:24:04 +0100 Subject: [PATCH 20/47] Update function to complement alleles. Will not complement missing/complex alleles that don't match valid DNA string. Examples: - null -> null - G*06:02 -> G*06:02 - A -> T - AC -> TG - CT -> GA - C -> G - etc. --- pgscatalog_utils/match/preprocess.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3c5ce73..916d06d 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -31,6 +31,28 @@ def ugly_complement(df: pl.DataFrame) -> pl.DataFrame: ]) +def complement_valid_alleles(df: pl.DataFrame, flip_cols = []) -> pl.DataFrame: + """ Improved function to complement alleles. Will only complement sequences that are valid DNA. + Uses same method ugly_complement (str.replace_all) above. + """ + for col in flip_cols: + new_col = col + '_FLIP' + df = df.with_column( + pl.when(pl.col(col).str.contains('^[ACGT]*$')) + .then(pl.col(col).str.replace_all("A", "V") + .str.replace_all("T", "X") + .str.replace_all("C", "Y") + .str.replace_all("G", "Z") + .str.replace_all("V", "T") + .str.replace_all("X", "A") + .str.replace_all("Y", "G") + .str.replace_all("Z", "C")) + .otherwise(pl.col(col)) + .alias(new_col) + ) + return df + + def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) -> pl.DataFrame: # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( From 3e43c8ffb781c089d428c1e0c0a588c4f7744160 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 16 Aug 2022 09:38:16 +0100 Subject: [PATCH 21/47] Fix regex to 1 or more from zero or more --- pgscatalog_utils/match/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 916d06d..2af24d8 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -38,7 +38,7 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols = []) -> pl.DataFrame: for col in flip_cols: new_col = col + '_FLIP' df = df.with_column( - pl.when(pl.col(col).str.contains('^[ACGT]*$')) + pl.when(pl.col(col).str.contains('^[ACGT]+$')) .then(pl.col(col).str.replace_all("A", "V") .str.replace_all("T", "X") .str.replace_all("C", "Y") From 369a69ad5799b32933a7cdc8cf5fcc687d3e7a70 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 16 Aug 2022 09:56:18 +0100 Subject: [PATCH 22/47] Suggested argument to ignore strand_flips (default behaviour is to include them) --- pgscatalog_utils/match/match_variants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index bc8a4cd..0b114c2 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -129,6 +129,9 @@ def _parse_args(args=None): 'statistics were used to construct the score.'), parser.add_argument('--keep_multiallelic', dest='remove_multiallelic', action='store_false', help='Flag to allow matching to multiallelic variants (default: false).') + parser.add_argument('--ignore_strand_flips', dest='consider_strand_flips', action='store_false', + help='Flag to not consider matched variants that may be reported on the opposite strand. ' + 'Default behaviour is to flip/complement unmatched variants and check if they match.') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) From fd382afc23fcf6aeb80a08948f69b6f09b10944d Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 16 Aug 2022 10:28:36 +0100 Subject: [PATCH 23/47] Flip scorefile instead of target data --- pgscatalog_utils/match/read.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 025180a..5579bc8 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -4,7 +4,7 @@ import polars as pl -from pgscatalog_utils.match.preprocess import ugly_complement, handle_multiallelic, check_weights +from pgscatalog_utils.match.preprocess import handle_multiallelic, check_weights, complement_valid_alleles logger = logging.getLogger(__name__) @@ -35,11 +35,9 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, case 'bim': return (df[_default_cols()] .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False) - .pipe(ugly_complement)) case 'pvar': return (df[_default_cols()] .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True) - .pipe(ugly_complement)) case _: logger.error("Invalid file format detected") raise Exception @@ -47,7 +45,9 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") - scorefile: pl.DataFrame = pl.read_csv(path, sep='\t', dtype={'chr_name': str}) + scorefile: pl.DataFrame = pl.read_csv(path, sep='\t', dtype={'chr_name': str}).pipe(complement_valid_alleles, + flip_cols=['effect_allele', + 'other_allele']) check_weights(scorefile) return scorefile From b926d2d16605df70629e049a2b5140d91cc70245 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 16 Aug 2022 11:15:18 +0100 Subject: [PATCH 24/47] get best match type --- pgscatalog_utils/match/postprocess.py | 43 +++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index adb4932..cd1eebd 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -1,3 +1,4 @@ +from functools import reduce import polars as pl import logging @@ -40,14 +41,46 @@ def _get_distinct_weights(df: pl.DataFrame) -> pl.DataFrame: singletons: pl.DataFrame = (count.filter(pl.col('count') == 1)[:, "accession":"effect_allele"] .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) - # TODO: something more complex than .unique()? - # TODO: prioritise unambiguous -> ref -> alt -> ref_flip -> alt_flip dups: pl.DataFrame = (count.filter(pl.col('count') > 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left') - .distinct(subset=['accession', 'chr_name', 'chr_position', 'effect_allele'])) - distinct: pl.DataFrame = pl.concat([singletons, dups]) + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + + distinct: pl.DataFrame = pl.concat([singletons, _prioritise_match_type(dups)]) assert all((distinct.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count()['count']) == 1), \ "Duplicate effect weights for a variant" return distinct + + +def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: + dup_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") != None) + dup_no_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") == None) + best_matches: list[pl.DataFrame] = [] + + if dup_oa: + match_priority: list[str] = ['refalt', 'altref', 'refalt_flip', 'altref_flip'] + logger.debug(f"Prioritising matches in order {match_priority}") + best_matches.append(_get_best_match(dup_oa, match_priority)) + + if dup_no_oa: + match_priority: list[str] = ['no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] + logger.debug(f"Prioritising matches in order {match_priority}") + best_matches.append(_get_best_match(dup_no_oa, match_priority)) + + return pl.concat(best_matches) + + +def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: + match: list[pl.DataFrame] = [] + for match_type in match_priority: + match.append(df.filter(pl.col("match_type") == match_type)) + logger.debug("Filtering best match types") + return reduce(lambda x, y: _join_best_match(x, y), match) + + +def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: + # variants in dataframe x have a higher priority than dataframe y + # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x + not_in: pl.DataFrame = y.join(x, how='anti', + on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) + return pl.concat([x, not_in]) From b5c8531a0ae230e47f98ba7160846f09b4bd98a8 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 16 Aug 2022 11:15:36 +0100 Subject: [PATCH 25/47] move debug statement --- pgscatalog_utils/match/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 8dc0d53..450ad69 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -13,7 +13,6 @@ def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, chrom: str = "") -> pl.DataFrame: target: Target = _detect_target_format(path) d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string - logger.debug(f"Reading target {path}") if singie_file: logger.debug(f"Scanning target genome for chromosome {chrom}") @@ -27,6 +26,7 @@ def read_target(path: str, remove_multiallelic: bool, singie_file: bool = False, logger.warning(f"Chromosome missing from target genome: {chrom}") return df else: + logger.debug(f"Reading target {path}") df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) df.columns = target.header From b2bc27cea20175b25f39cf69f25d23e846843c57 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 16 Aug 2022 13:43:07 +0100 Subject: [PATCH 26/47] match with complemented scorefile columns instead of target columns --- pgscatalog_utils/match/match.py | 104 +++++++++++++++----------- pgscatalog_utils/match/postprocess.py | 4 +- pgscatalog_utils/match/preprocess.py | 31 +------- pgscatalog_utils/match/read.py | 9 +-- 4 files changed, 68 insertions(+), 80 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index f50027b..779f431 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -16,27 +16,17 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu if scorefile_oa: logger.debug("Getting matches for scores with effect allele and other allele") - matches.append(_match_variants(scorefile_cat, target_cat, effect_allele='REF', other_allele='ALT', - match_type="refalt")) - matches.append(_match_variants(scorefile_cat, target_cat, effect_allele='ALT', other_allele='REF', - match_type="altref")) - matches.append(_match_variants(scorefile_cat, target_cat, effect_allele='REF_FLIP', - other_allele='ALT_FLIP', - match_type="refalt_flip")) - matches.append(_match_variants(scorefile_cat, target_cat, effect_allele='ALT_FLIP', - other_allele='REF_FLIP', - match_type="altref_flip")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip")) if scorefile_no_oa: logger.debug("Getting matches for scores with effect allele only") - matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='REF', other_allele=None, - match_type="no_oa_ref")) - matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='ALT', other_allele=None, - match_type="no_oa_alt")) - matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='REF_FLIP', - other_allele=None, match_type="no_oa_ref_flip")) - matches.append(_match_variants(scorefile_no_oa, target_cat, effect_allele='ALT_FLIP', - other_allele=None, match_type="no_oa_alt_flip")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip")) return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) @@ -54,9 +44,32 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): if rate < (1 - min_overlap): - logger.debug(f"Score {accession} passes minimum matching threshold ({1-rate:.2%} variants match)") + logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)") else: - logger.error(f"Score {accession} fails minimum matching threshold ({1-rate:.2%} variants match)") + logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") + raise Exception + + +def _get_match_keys(strategy: str) -> tuple[list[str], list[str]]: + match strategy: + case 'refalt': + return _scorefile_keys('effect_allele', 'other_allele'), _target_keys() + case 'altref': + return _scorefile_keys('other_allele', 'effect_allele'), _target_keys() + case 'refalt_flip': + return _scorefile_keys('effect_allele_FLIP', 'other_allele_FLIP'), _target_keys() + case 'altref_flip': + return _scorefile_keys('other_allele_FLIP', 'effect_allele_FLIP'), _target_keys() + case 'no_oa_ref': + return _scorefile_keys('effect_allele', ''), _target_keys(False) + case 'no_oa_alt': + return _scorefile_keys('other_allele', ''), _target_keys(False) + case 'no_oa_ref_flip': + return _scorefile_keys('effect_allele_FLIP', ''), _target_keys(False) + case 'no_oa_alt_flip': + return _scorefile_keys('other_allele_FLIP', ''), _target_keys(False) + case _: + logger.critical(f"Invalid match strategy: {strategy}") raise Exception @@ -71,28 +84,31 @@ def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str): def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, - effect_allele: str, - other_allele: str | None, match_type: str) -> pl.DataFrame: logger.debug(f"Matching strategy: {match_type}") + score_key, target_key = _get_match_keys(match_type) return (scorefile.join(target, - left_on=_scorefile_keys(other_allele), - right_on=_target_keys(effect_allele, other_allele), - how='inner')).pipe(_post_match, effect_allele, other_allele, match_type) + left_on=score_key, + right_on=target_key, + how='inner').pipe(_post_match, target_key, match_type)) def _post_match(df: pl.DataFrame, - effect_allele: str, - other_allele: str, + target_keys: list[str], match_type: str) -> pl.DataFrame: """ Annotate matches with parameters """ - if other_allele is None: + if len(target_keys) == 3: logger.debug("Dropping missing other_allele during annotation") - other_allele = 'dummy' # prevent trying to alias a column to None + ref_key = target_keys[-1] + alt_key = 'dummy' # prevent trying to alias a column to None + else: + ref_key = target_keys[-2] + alt_key = target_keys[-1] + # aliases keep a copy of columns dropped during the join return df.with_columns([pl.col("*"), - pl.col("effect_allele").alias(effect_allele), - pl.col("other_allele").alias(other_allele), + pl.col("effect_allele").alias(ref_key), + pl.col("other_allele").alias(alt_key), pl.lit(match_type).alias("match_type"), ])[_matched_colnames()] @@ -104,33 +120,33 @@ def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: pl.col("effect_allele").cast(pl.Categorical), pl.col("other_allele").cast(pl.Categorical), pl.col("effect_type").cast(pl.Categorical), + pl.col("effect_allele_FLIP").cast(pl.Categorical), + pl.col("other_allele_FLIP").cast(pl.Categorical), pl.col("accession").cast(pl.Categorical) ]) if target: target = target.with_columns([ pl.col("REF").cast(pl.Categorical), - pl.col("ALT").cast(pl.Categorical), - pl.col("ALT_FLIP").cast(pl.Categorical), - pl.col("REF_FLIP").cast(pl.Categorical) + pl.col("ALT").cast(pl.Categorical) ]) return scorefile, target -def _scorefile_keys(other_allele: str) -> list[str]: - if other_allele: - return ['chr_name', 'chr_position', 'effect_allele', 'other_allele'] +def _scorefile_keys(ref_key: str, alt_key: str) -> list[str]: + if alt_key: + return ['chr_name', 'chr_position', ref_key, alt_key] else: - return ['chr_name', 'chr_position', 'effect_allele'] + return ['chr_name', 'chr_position', ref_key] -def _target_keys(effect_allele: str, other_allele: str) -> list[str]: - if other_allele: - return ['#CHROM', 'POS', effect_allele, other_allele] +def _target_keys(alt_key: bool = True) -> list[str]: + if alt_key: + return ['#CHROM', 'POS', "REF", "ALT"] else: - return ['#CHROM', 'POS', effect_allele] + return ['#CHROM', 'POS', "REF"] def _matched_colnames() -> list[str]: - return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', - 'ID', 'REF', 'ALT', 'REF_FLIP', 'ALT_FLIP', 'match_type', 'is_multiallelic'] + return ['chr_name', 'chr_position', 'effect_allele', 'effect_allele_FLIP', 'other_allele', 'other_allele_FLIP', + 'effect_weight', 'effect_type', 'accession', 'ID', 'REF', 'ALT', 'match_type', 'is_multiallelic'] diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index adb4932..ce33483 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -24,12 +24,12 @@ def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFram def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: # A / T or C / G may match multiple times df = df.with_columns([ - pl.col(["effect_allele", "other_allele", "REF", "ALT", "REF_FLIP", "ALT_FLIP"]).cast(str), + pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), pl.lit(True).alias("ambiguous") ]) return (df.with_column( - pl.when((pl.col("effect_allele") == pl.col("ALT_FLIP")) | (pl.col("effect_allele") == pl.col("REF_FLIP"))) + pl.when((pl.col("effect_allele_FLIP") == pl.col("ALT")) | (pl.col("effect_allele_FLIP") == pl.col("REF"))) .then(pl.col("ambiguous")) .otherwise(False))).pipe(_get_distinct_weights) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 2af24d8..0be878f 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -4,38 +4,11 @@ logger = logging.getLogger(__name__) -def ugly_complement(df: pl.DataFrame) -> pl.DataFrame: - """ Complementing alleles with a pile of regexes seems weird, but polars string functions are currently limited - (i.e. no str.translate). This is fast, and I stole the regex idea from Scott. - """ - logger.debug("Complementing target alleles") - return df.with_columns([ - (pl.col("REF").str.replace_all("A", "V") - .str.replace_all("T", "X") - .str.replace_all("C", "Y") - .str.replace_all("G", "Z") - .str.replace_all("V", "T") - .str.replace_all("X", "A") - .str.replace_all("Y", "G") - .str.replace_all("Z", "C")) - .alias("REF_FLIP"), - (pl.col("ALT").str.replace_all("A", "V") - .str.replace_all("T", "X") - .str.replace_all("C", "Y") - .str.replace_all("G", "Z") - .str.replace_all("V", "T") - .str.replace_all("X", "A") - .str.replace_all("Y", "G") - .str.replace_all("Z", "C")) - .alias("ALT_FLIP") - ]) - - -def complement_valid_alleles(df: pl.DataFrame, flip_cols = []) -> pl.DataFrame: +def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataFrame: """ Improved function to complement alleles. Will only complement sequences that are valid DNA. - Uses same method ugly_complement (str.replace_all) above. """ for col in flip_cols: + logger.debug(f"Complementing scorefile column {col}") new_col = col + '_FLIP' df = df.with_column( pl.when(pl.col(col).str.contains('^[ACGT]+$')) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 5579bc8..8357782 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -34,10 +34,10 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, match target.file_format: case 'bim': return (df[_default_cols()] - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False) + .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False)) case 'pvar': return (df[_default_cols()] - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True) + .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True)) case _: logger.error("Invalid file format detected") raise Exception @@ -45,9 +45,8 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") - scorefile: pl.DataFrame = pl.read_csv(path, sep='\t', dtype={'chr_name': str}).pipe(complement_valid_alleles, - flip_cols=['effect_allele', - 'other_allele']) + scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])) check_weights(scorefile) return scorefile From 7859ef72ccdf170e52016e7417b8964f34c03232 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 16 Aug 2022 14:13:14 +0100 Subject: [PATCH 27/47] fix liftover with missing positions --- conftest.py | 2 +- pgscatalog_utils/download/score.py | 14 +++++++------- pgscatalog_utils/scorefile/liftover.py | 14 ++++++++------ tests/test_combine.py | 14 ++++++++++++-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/conftest.py b/conftest.py index 836853b..a30f2cd 100644 --- a/conftest.py +++ b/conftest.py @@ -114,7 +114,7 @@ def chain_files(db, tmp_path_factory): def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory): out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38', - '-m', '0.95'] + ['-o', str(out_path.resolve())] + '-m', '0.8'] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index 546cc6f..61a0154 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -12,7 +12,7 @@ def get_url(pgs: list[str], build: str) -> dict[str, str]: for chunk in _chunker(pgs): try: - response = _parse_json_query(_query_score(chunk), build) + response = _parse_json_query(query_score(chunk), build) pgs_result = pgs_result + list(response.keys()) url_result = url_result + list(response.values()) except TypeError: @@ -27,18 +27,18 @@ def get_url(pgs: list[str], build: str) -> dict[str, str]: return dict(zip(pgs_result, url_result)) -def _chunker(pgs: list[str]): - size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs - return(pgs[pos: pos + size] for pos in range(0, len(pgs), size)) - - -def _query_score(pgs_id: list[str]) -> dict: +def query_score(pgs_id: list[str]) -> dict: pgs: str = ','.join(pgs_id) api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}' r: requests.models.Response = requests.get(api) return r.json() +def _chunker(pgs: list[str]): + size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs + return(pgs[pos: pos + size] for pos in range(0, len(pgs), size)) + + def _parse_json_query(json: dict, build: str | None) -> dict[str, str]: result = jq.compile(".results").input(json).first() if not result: diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 6390afb..0d3008c 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -62,12 +62,14 @@ def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) -> pd.Series: """ Convert genomic coordinates to different build """ - - lo = lo_dict[df['genome_build'] + df['target_build']] # extract lo object from dict - chrom: str = 'chr' + str(df['chr_name']) - pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed - # converted example: [('chr22', 15460378, '+', 3320966530)] or None - converted: list[tuple[str, int, str, int] | None] = lo.convert_coordinate(chrom, pos) + if df[['chr_name', 'chr_position']].isnull().values.any(): + converted = None + else: + lo = lo_dict[df['genome_build'] + df['target_build']] # extract lo object from dict + chrom: str = 'chr' + str(df['chr_name']) + pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed + # converted example: [('chr22', 15460378, '+', 3320966530)] or None + converted: list[tuple[str, int, str, int] | None] = lo.convert_coordinate(chrom, pos) if converted: lifted_chrom: str = _parse_lifted_chrom(converted[0][0][3:]) # return first matching liftover diff --git a/tests/test_combine.py b/tests/test_combine.py index 86fb824..6243cef 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,14 +1,24 @@ import pandas as pd +import pytest +import jq +from pgscatalog_utils.download.score import query_score -def test_combine_scorefiles(combined_scorefile): + +def test_combine_scorefiles(combined_scorefile, _n_variants): df = pd.read_table(combined_scorefile) cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession'} assert set(df.columns).issubset(cols) - assert df.shape[0] == 51215 # combined number of variants + assert df.shape[0] == _n_variants def test_liftover(lifted_scorefiles): df = pd.read_table(lifted_scorefiles) assert df.shape[0] > 50000 # approx size + +@pytest.fixture +def _n_variants(pgs_accessions): + json = query_score(pgs_accessions) + n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() + return sum(n) From f2a21949a85b0261b57448306ba9225ea90c5d5b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 16 Aug 2022 14:27:22 +0100 Subject: [PATCH 28/47] skip flips with parameter --- pgscatalog_utils/match/match_variants.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0b114c2..b98149d 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -17,7 +17,7 @@ def match_variants(): set_logging_level(args.verbose) - logger.debug(f"n_threads: {pl.threadpool_size()}") + logger.debug(f"polars n_threads: {pl.threadpool_size()}") scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): @@ -50,6 +50,10 @@ def match_variants(): logger.critical(f"Invalid match mode: {match_mode}") raise Exception + if args.skip_flip: + logger.debug("Ignoring strand flip matches") + matches = _drop_flips(matches) + dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) @@ -106,6 +110,10 @@ def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multi return pl.concat(matches) +def _drop_flips(df: pl.DataFrame): + return df.filter(pl.col('match_type').str.contains("flip").is_not()) + + def _parse_args(args=None): parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, @@ -129,7 +137,7 @@ def _parse_args(args=None): 'statistics were used to construct the score.'), parser.add_argument('--keep_multiallelic', dest='remove_multiallelic', action='store_false', help='Flag to allow matching to multiallelic variants (default: false).') - parser.add_argument('--ignore_strand_flips', dest='consider_strand_flips', action='store_false', + parser.add_argument('--ignore_strand_flips', dest='skip_flip', action='store_true', help='Flag to not consider matched variants that may be reported on the opposite strand. ' 'Default behaviour is to flip/complement unmatched variants and check if they match.') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', From 1a814dc3651db4b7559a954e47a423c013c13c0b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 16 Aug 2022 16:03:51 +0100 Subject: [PATCH 29/47] fix matches, and be clearer about match keys --- pgscatalog_utils/match/match.py | 75 ++++++++++++++------------------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 779f431..b4990f2 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -50,47 +50,48 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap raise Exception -def _get_match_keys(strategy: str) -> tuple[list[str], list[str]]: - match strategy: +def _match_keys(): + return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', + 'accession', 'effect_type', 'effect_weight'] + + +def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str): + return scorefile.join(matches, on=_match_keys(), how='left').with_column(pl.lit(dataset).alias('dataset')) + + +def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: + logger.debug(f"Matching strategy: {match_type}") + match match_type: case 'refalt': - return _scorefile_keys('effect_allele', 'other_allele'), _target_keys() + score_keys = ["chr_name", "chr_position", "effect_allele", "other_allele"] + target_keys = ["#CHROM", "POS", "REF", "ALT"] case 'altref': - return _scorefile_keys('other_allele', 'effect_allele'), _target_keys() + score_keys = ["chr_name", "chr_position", "effect_allele", "other_allele"] + target_keys = ["#CHROM", "POS", "ALT", "REF"] case 'refalt_flip': - return _scorefile_keys('effect_allele_FLIP', 'other_allele_FLIP'), _target_keys() + score_keys = ["chr_name", "chr_position", "effect_allele_FLIP", "other_allele_FLIP"] + target_keys = ["#CHROM", "POS", "REF", "ALT"] case 'altref_flip': - return _scorefile_keys('other_allele_FLIP', 'effect_allele_FLIP'), _target_keys() + score_keys = ["chr_name", "chr_position", "effect_allele_FLIP", "other_allele_FLIP"] + target_keys = ["#CHROM", "POS", "ALT", "REF"] case 'no_oa_ref': - return _scorefile_keys('effect_allele', ''), _target_keys(False) + score_keys = ["chr_name", "chr_position", "effect_allele"] + target_keys = ["#CHROM", "POS", "REF"] case 'no_oa_alt': - return _scorefile_keys('other_allele', ''), _target_keys(False) + score_keys = ["chr_name", "chr_position", "effect_allele"] + target_keys = ["#CHROM", "POS", "ALT"] case 'no_oa_ref_flip': - return _scorefile_keys('effect_allele_FLIP', ''), _target_keys(False) + score_keys = ["chr_name", "chr_position", "effect_allele_FLIP"] + target_keys = ["#CHROM", "POS", "REF"] case 'no_oa_alt_flip': - return _scorefile_keys('other_allele_FLIP', ''), _target_keys(False) + score_keys = ["chr_name", "chr_position", "effect_allele_FLIP"] + target_keys = ["#CHROM", "POS", "ALT"] case _: - logger.critical(f"Invalid match strategy: {strategy}") + logger.critical(f"Invalid match strategy: {match_type}") raise Exception - -def _match_keys(): - return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', - 'accession', 'effect_type', 'effect_weight'] - - -def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str): - return scorefile.join(matches, on=_match_keys(), how='left').with_column(pl.lit(dataset).alias('dataset')) - - -def _match_variants(scorefile: pl.DataFrame, - target: pl.DataFrame, - match_type: str) -> pl.DataFrame: - logger.debug(f"Matching strategy: {match_type}") - score_key, target_key = _get_match_keys(match_type) - return (scorefile.join(target, - left_on=score_key, - right_on=target_key, - how='inner').pipe(_post_match, target_key, match_type)) + return (scorefile.join(target, score_keys, target_keys, how='inner') + .pipe(_post_match, target_keys, match_type)) def _post_match(df: pl.DataFrame, @@ -133,20 +134,6 @@ def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: return scorefile, target -def _scorefile_keys(ref_key: str, alt_key: str) -> list[str]: - if alt_key: - return ['chr_name', 'chr_position', ref_key, alt_key] - else: - return ['chr_name', 'chr_position', ref_key] - - -def _target_keys(alt_key: bool = True) -> list[str]: - if alt_key: - return ['#CHROM', 'POS', "REF", "ALT"] - else: - return ['#CHROM', 'POS', "REF"] - - def _matched_colnames() -> list[str]: return ['chr_name', 'chr_position', 'effect_allele', 'effect_allele_FLIP', 'other_allele', 'other_allele_FLIP', 'effect_weight', 'effect_type', 'accession', 'ID', 'REF', 'ALT', 'match_type', 'is_multiallelic'] From 4e13ebb91c7214a5abad48a64ef74a2b32165e47 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 17 Aug 2022 10:42:37 +0100 Subject: [PATCH 30/47] fix matching with flipped alleles --- pgscatalog_utils/match/match.py | 40 ++++++++++----------------- pgscatalog_utils/match/postprocess.py | 6 ++-- pgscatalog_utils/match/write.py | 22 +++++++++------ 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index b4990f2..ad49f71 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -65,53 +65,45 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s case 'refalt': score_keys = ["chr_name", "chr_position", "effect_allele", "other_allele"] target_keys = ["#CHROM", "POS", "REF", "ALT"] + effect_allele_column = "effect_allele" case 'altref': score_keys = ["chr_name", "chr_position", "effect_allele", "other_allele"] target_keys = ["#CHROM", "POS", "ALT", "REF"] + effect_allele_column = "effect_allele" case 'refalt_flip': score_keys = ["chr_name", "chr_position", "effect_allele_FLIP", "other_allele_FLIP"] target_keys = ["#CHROM", "POS", "REF", "ALT"] + effect_allele_column = "effect_allele_FLIP" case 'altref_flip': score_keys = ["chr_name", "chr_position", "effect_allele_FLIP", "other_allele_FLIP"] target_keys = ["#CHROM", "POS", "ALT", "REF"] + effect_allele_column = "effect_allele_FLIP" case 'no_oa_ref': score_keys = ["chr_name", "chr_position", "effect_allele"] target_keys = ["#CHROM", "POS", "REF"] + effect_allele_column = "effect_allele" case 'no_oa_alt': score_keys = ["chr_name", "chr_position", "effect_allele"] target_keys = ["#CHROM", "POS", "ALT"] + effect_allele_column = "effect_allele" case 'no_oa_ref_flip': score_keys = ["chr_name", "chr_position", "effect_allele_FLIP"] target_keys = ["#CHROM", "POS", "REF"] + effect_allele_column = "effect_allele_FLIP" case 'no_oa_alt_flip': score_keys = ["chr_name", "chr_position", "effect_allele_FLIP"] target_keys = ["#CHROM", "POS", "ALT"] + effect_allele_column = "effect_allele_FLIP" case _: logger.critical(f"Invalid match strategy: {match_type}") raise Exception return (scorefile.join(target, score_keys, target_keys, how='inner') - .pipe(_post_match, target_keys, match_type)) - - -def _post_match(df: pl.DataFrame, - target_keys: list[str], - match_type: str) -> pl.DataFrame: - """ Annotate matches with parameters """ - if len(target_keys) == 3: - logger.debug("Dropping missing other_allele during annotation") - ref_key = target_keys[-1] - alt_key = 'dummy' # prevent trying to alias a column to None - else: - ref_key = target_keys[-2] - alt_key = target_keys[-1] - - # aliases keep a copy of columns dropped during the join - return df.with_columns([pl.col("*"), - pl.col("effect_allele").alias(ref_key), - pl.col("other_allele").alias(alt_key), - pl.lit(match_type).alias("match_type"), - ])[_matched_colnames()] + .with_columns([pl.col("*"), + pl.col(effect_allele_column).alias("matched_effect_allele"), + pl.lit(match_type).alias("match_type")]) + .join(target, on="ID", how="inner") # get REF / ALT back after first join + .drop(["#CHROM", "POS", "is_multiallelic_right"])) def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: @@ -127,13 +119,9 @@ def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: ]) if target: target = target.with_columns([ + pl.col("ID").cast(pl.Categorical), pl.col("REF").cast(pl.Categorical), pl.col("ALT").cast(pl.Categorical) ]) return scorefile, target - - -def _matched_colnames() -> list[str]: - return ['chr_name', 'chr_position', 'effect_allele', 'effect_allele_FLIP', 'other_allele', 'other_allele_FLIP', - 'effect_weight', 'effect_type', 'accession', 'ID', 'REF', 'ALT', 'match_type', 'is_multiallelic'] diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index db90499..14f24ca 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -23,7 +23,6 @@ def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFram def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: - # A / T or C / G may match multiple times df = df.with_columns([ pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), pl.lit(True).alias("ambiguous") @@ -44,7 +43,10 @@ def _get_distinct_weights(df: pl.DataFrame) -> pl.DataFrame: dups: pl.DataFrame = (count.filter(pl.col('count') > 1)[:, "accession":"effect_allele"] .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) - distinct: pl.DataFrame = pl.concat([singletons, _prioritise_match_type(dups)]) + if dups: + distinct: pl.DataFrame = pl.concat([singletons, _prioritise_match_type(dups)]) + else: + distinct: pl.DataFrame = singletons assert all((distinct.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count()['count']) == 1), \ "Duplicate effect weights for a variant" diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 50bbdba..f56029c 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -6,14 +6,17 @@ def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: - if os.path.isdir(outdir) is False: + if not os.path.isdir(outdir): os.mkdir(outdir) + logger.debug("Splitting by effect type") effect_types: dict[str, pl.DataFrame] = _split_effect_type(df) + logger.debug("Deduplicating variants") deduplicated: dict[str, pl.DataFrame] = {k: _deduplicate_variants(k, v) for k, v in effect_types.items()} - ea_dict: dict[str, str] = {'is_dominant': 'dominant', 'is_recessive': 'recessive', 'additive': 'additive'} + logger.debug("Writing out scorefiles") + ea_dict: dict[str, str] = {'is_dominant': 'dominant', 'is_recessive': 'recessive', 'additive': 'additive'} [_write_scorefile(ea_dict.get(k), v, split, outdir, dataset) for k, v in deduplicated.items()] @@ -47,12 +50,14 @@ def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: logger.debug("Split output requested") chroms: list[int] = df["chr_name"].unique().to_list() return {x: (df.filter(pl.col("chr_name") == x) - .pivot(index=["ID", "effect_allele"], values="effect_weight", columns="accession") + .pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") + .rename({"matched_effect_allele": "effect_allele"}) .pipe(_fill_null)) for x in chroms} else: logger.debug("Split output not requested") - formatted: pl.DataFrame = (df.pivot(index=["ID", "effect_allele"], values="effect_weight", columns="accession") + formatted: pl.DataFrame = (df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") + .rename({"matched_effect_allele": "effect_allele"}) .pipe(_fill_null)) return {'false': formatted} @@ -112,9 +117,10 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra def _fill_null(df): # nulls are created when pivoting wider - if any(df.null_count() > 0): - logger.debug("Filling null weights with zero after pivoting wide") - return df.fill_null(0) - else: + nulls: pl.DataFrame = (df.null_count().filter(pl.col("*") > 0)) + if nulls.is_empty(): # is_empty() avoids weird truthiness problems logger.debug("No null weights detected") return df + else: + logger.debug("Filling null weights with zero after pivoting wide") + return df.fill_null(0) From a373f0a5faba46168783691175f6e375aac18b1c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 17 Aug 2022 11:22:38 +0100 Subject: [PATCH 31/47] update labelling biallelic ambiguous variants --- pgscatalog_utils/match/postprocess.py | 7 +++++-- pgscatalog_utils/match/preprocess.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 14f24ca..ef978b9 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -2,6 +2,8 @@ import polars as pl import logging +from pgscatalog_utils.match.preprocess import complement_valid_alleles + logger = logging.getLogger(__name__) @@ -23,13 +25,14 @@ def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFram def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Labelling ambiguous variants") df = df.with_columns([ pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), pl.lit(True).alias("ambiguous") - ]) + ]).pipe(complement_valid_alleles, ["REF"]) return (df.with_column( - pl.when((pl.col("effect_allele_FLIP") == pl.col("ALT")) | (pl.col("effect_allele_FLIP") == pl.col("REF"))) + pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) .otherwise(False))).pipe(_get_distinct_weights) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 0be878f..46fdc7f 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -8,7 +8,7 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF """ Improved function to complement alleles. Will only complement sequences that are valid DNA. """ for col in flip_cols: - logger.debug(f"Complementing scorefile column {col}") + logger.debug(f"Complementing column {col}") new_col = col + '_FLIP' df = df.with_column( pl.when(pl.col(col).str.contains('^[ACGT]+$')) From 00308560a7333e404e66a40e3dc9a3a4b7d5aae9 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 17 Aug 2022 11:40:57 +0100 Subject: [PATCH 32/47] update assert --- pgscatalog_utils/match/postprocess.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index ef978b9..300ef22 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -51,8 +51,7 @@ def _get_distinct_weights(df: pl.DataFrame) -> pl.DataFrame: else: distinct: pl.DataFrame = singletons - assert all((distinct.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count()['count']) == 1), \ - "Duplicate effect weights for a variant" + assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" return distinct From 1bcc2d07faa29561d0abdf13bd9269bbc9491dc8 Mon Sep 17 00:00:00 2001 From: Sam Lambert Date: Wed, 17 Aug 2022 14:02:19 +0100 Subject: [PATCH 33/47] Edit description --- pgscatalog_utils/match/postprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 300ef22..9d2b966 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -38,7 +38,7 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: def _get_distinct_weights(df: pl.DataFrame) -> pl.DataFrame: - """ Get a single effect weight for each matched variant per accession """ + """ Select single matched variant in target for each variant in the scoring file (e.g. per accession) """ count: pl.DataFrame = df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() singletons: pl.DataFrame = (count.filter(pl.col('count') == 1)[:, "accession":"effect_allele"] .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) From d3e0f69d3bffd46f11721b10868e10ec213eb300 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 17 Aug 2022 14:24:36 +0100 Subject: [PATCH 34/47] Ignore strand flips by not looking for them (instead of dropping them post-hoc) --- pgscatalog_utils/match/match.py | 12 ++++--- pgscatalog_utils/match/match_variants.py | 43 ++++++++++-------------- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index ad49f71..65ad92f 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, skip_flip: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -18,15 +18,17 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu logger.debug("Getting matches for scores with effect allele and other allele") matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt")) matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref")) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip")) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip")) + if skip_flip is False: + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip")) if scorefile_no_oa: logger.debug("Getting matches for scores with effect allele only") matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref")) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt")) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip")) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip")) + if skip_flip is False: + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip")) return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index b98149d..a7b2f96 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -34,26 +34,22 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, args.skip_flip) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous) + args.remove_ambiguous, args.skip_flip) case "fast": logger.debug(f"Match mode: {match_mode}") check_chrom: bool = False if n_target_files > 1: check_chrom = True matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, check_chrom) + args.remove_ambiguous, check_chrom, args.skip_flip) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception - if args.skip_flip: - logger.debug("Ignoring strand flip matches") - matches = _drop_flips(matches) - dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator check_match_rate(scorefile, matches, args.min_overlap, dataset) @@ -64,8 +60,16 @@ def match_variants(): write_out(matches, args.split, args.outdir, dataset) +def _check_target_chroms(target) -> None: + n_chrom: int = len(target['#CHROM'].unique().to_list()) + if n_chrom > 1: + logger.critical("Multiple chromosomes detected in split file. Check input data.") + raise Exception + else: + logger.debug("Split target genome contains one chromosome (good)") + def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, check_chrom: bool) -> pl.DataFrame: + remove_ambiguous: bool, check_chrom: bool, skip_filp: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes @@ -73,47 +77,34 @@ def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: remove_multiallelic=remove_multiallelic) if check_chrom: _check_target_chroms(target) - return get_all_matches(scorefile, target, remove_ambiguous) + return get_all_matches(scorefile, target, remove_ambiguous, skip_filp) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) # _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, remove_ambiguous)) + matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp)) return pl.concat(matches) -def _check_target_chroms(target) -> None: - n_chrom: int = len(target['#CHROM'].unique().to_list()) - if n_chrom > 1: - logger.critical("Multiple chromosomes detected in split file. Check input data.") - raise Exception - else: - logger.debug("Split target genome contains one chromosome (good)") - - def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, remove_ambiguous)) + matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp)) return pl.concat(matches) -def _drop_flips(df: pl.DataFrame): - return df.filter(pl.col('match_type').str.contains("flip").is_not()) - - def _parse_args(args=None): parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') parser.add_argument('-d', '--dataset', dest='dataset', required=True, From 95eaad79c3a644bb1cf1f739dbea970b49382ad3 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 17 Aug 2022 16:34:11 +0100 Subject: [PATCH 35/47] Ignore variants with null positional data (e.g. HLA alleles) that can't be matched --- pgscatalog_utils/match/preprocess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 46fdc7f..3cc66f7 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -52,10 +52,10 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) def check_weights(df: pl.DataFrame) -> None: - weight_count = df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count()['count'] - - if any(weight_count > 1): - logger.error("Multiple effect weights per variant per accession detected") + """ Checks weights for scoring file variants that could be matched (e.g. have a chr & pos) """ + weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() + if any(weight_count['count'] > 1): + logger.error("Multiple effect weights per variant per accession detected in files: {}".format(list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) raise Exception From 4d53823e4cda4166ca044b761ef6a7c0090c5511 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 17 Aug 2022 17:34:01 +0100 Subject: [PATCH 36/47] Backfill REF/ALT alleles in matches, and reorder columns for efficient pl.concat without diagonal method --- pgscatalog_utils/match/match.py | 43 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 65ad92f..7e758a2 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -13,22 +13,25 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) matches: list[pl.DataFrame] = [] + col_order = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', + 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] + if scorefile_oa: logger.debug("Getting matches for scores with effect allele and other allele") - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt")) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt")[col_order]) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref")[col_order]) if skip_flip is False: - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip")) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip")) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip")[col_order]) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip")[col_order]) if scorefile_no_oa: logger.debug("Getting matches for scores with effect allele only") - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref")) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref")[col_order]) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt")[col_order]) if skip_flip is False: - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip")) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip")) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip")[col_order]) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip")[col_order]) return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) @@ -100,12 +103,26 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s logger.critical(f"Invalid match strategy: {match_type}") raise Exception + # first_join = scorefile.join(target, score_keys, target_keys, how='inner')\ + # .with_columns([pl.col("*"), + # pl.col(effect_allele_column).alias("matched_effect_allele"), + # pl.lit(match_type).alias("match_type")] + # ) + # + # print(match_type, first_join.columns) + + missing_cols = ['REF', 'ALT'] + if match_type.startswith('no_oa'): + if match_type.startswith('no_oa_ref'): + missing_cols = ['REF'] + else: + missing_cols = ['ALT'] + join_cols = ['ID'] + missing_cols return (scorefile.join(target, score_keys, target_keys, how='inner') - .with_columns([pl.col("*"), - pl.col(effect_allele_column).alias("matched_effect_allele"), - pl.lit(match_type).alias("match_type")]) - .join(target, on="ID", how="inner") # get REF / ALT back after first join - .drop(["#CHROM", "POS", "is_multiallelic_right"])) + .with_columns([pl.col("*"), + pl.col(effect_allele_column).alias("matched_effect_allele"), + pl.lit(match_type).alias("match_type")]) + .join(target[join_cols], on="ID", how="inner")) # get REF / ALT back after first join def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: From 8888a1f3fb9fc1e4f43461b5114429adc61158f7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 18 Aug 2022 14:14:46 +0100 Subject: [PATCH 37/47] .select() is faster than [] on large pl.DataFrames --- pgscatalog_utils/match/match.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7e758a2..4646927 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -19,19 +19,19 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu if scorefile_oa: logger.debug("Getting matches for scores with effect allele and other allele") - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt")[col_order]) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref")[col_order]) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt").select(col_order)) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref").select(col_order)) if skip_flip is False: - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip")[col_order]) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip")[col_order]) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip").select(col_order)) + matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip").select(col_order)) if scorefile_no_oa: logger.debug("Getting matches for scores with effect allele only") - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref")[col_order]) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt")[col_order]) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt").select(col_order)) if skip_flip is False: - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip")[col_order]) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip")[col_order]) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) @@ -103,14 +103,6 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s logger.critical(f"Invalid match strategy: {match_type}") raise Exception - # first_join = scorefile.join(target, score_keys, target_keys, how='inner')\ - # .with_columns([pl.col("*"), - # pl.col(effect_allele_column).alias("matched_effect_allele"), - # pl.lit(match_type).alias("match_type")] - # ) - # - # print(match_type, first_join.columns) - missing_cols = ['REF', 'ALT'] if match_type.startswith('no_oa'): if match_type.startswith('no_oa_ref'): @@ -122,7 +114,7 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s .with_columns([pl.col("*"), pl.col(effect_allele_column).alias("matched_effect_allele"), pl.lit(match_type).alias("match_type")]) - .join(target[join_cols], on="ID", how="inner")) # get REF / ALT back after first join + .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: From eba42d04e941b0eebcf0f25bbde4e7c4a2b24d00 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 18 Aug 2022 14:14:59 +0100 Subject: [PATCH 38/47] fix handling matches without other allele --- pgscatalog_utils/match/postprocess.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 9d2b966..33a0220 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -14,12 +14,8 @@ def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFram return df.filter(pl.col("ambiguous") == False) else: logger.debug("Keeping best possible match from ambiguous matches") - # pick the best possible match from the ambiguous matches - # EA = REF and OA = ALT or EA = REF and OA = None ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ - (pl.col("match_type") == "refalt") | - (pl.col("ambiguous") == True) & \ - (pl.col("match_type") == "no_oa_ref")) + (pl.col("match_type").str.contains('flip').is_not())) unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) return pl.concat([ambiguous, unambiguous]) From 4071a88b56f26a83192eaf63913238fa22e5ea69 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 18 Aug 2022 14:15:12 +0100 Subject: [PATCH 39/47] add test cases --- tests/test_match.py | 92 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/tests/test_match.py b/tests/test_match.py index 596461a..24956f7 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -1,9 +1,11 @@ import os from unittest.mock import patch -import pandas as pd +import polars as pl import pytest +from pgscatalog_utils.match.match import get_all_matches, _cast_categorical from pgscatalog_utils.match.match_variants import match_variants +from pgscatalog_utils.match.preprocess import complement_valid_alleles def test_match_fail(combined_scorefile, target_path, tmp_path): @@ -34,3 +36,91 @@ def test_match_pass(mini_scorefile, target_path, tmp_path): with patch('sys.argv', args): match_variants() + +def _cast_cat(scorefile, target): + with pl.StringCache(): + return _cast_categorical(scorefile, target) + + +def test_match_strategies(small_scorefile, small_target): + scorefile, target = _cast_cat(small_scorefile, small_target) + + # check unambiguous matches + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) + assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) + assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) + + # when keeping ambiguous and flipping alleles: + # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' + # flipped matches should be dropped for ambiguous matches + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False)\ + .filter(pl.col('ambiguous') == True)) + assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) + assert set(flip['match_type'].to_list()).issubset({'altref'}) + + +def test_no_oa_match(small_scorefile_no_oa, small_target): + scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) + + df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True) + assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) + assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) + + # one of the matches is ambiguous + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) + .filter(pl.col('ambiguous') == True)) + assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) + assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) + + +def test_flip_match(small_flipped_scorefile, small_target): + scorefile, target = _cast_cat(small_flipped_scorefile, small_target) + + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) + assert df.is_empty() + + flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False) + assert flip['match_type'].str.contains('flip').all() + assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) + + flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) + .filter(pl.col('ambiguous') == True)) + assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous + + +@pytest.fixture +def small_scorefile(): + df = pl.DataFrame({"accession": ["test", "test", "test"], + "chr_name": [1, 2, 3], + "chr_position": [1, 2, 3], + "effect_allele": ["A", "A", "G"], + "other_allele": ["C", "T", "T"], + "effect_weight": [1, 2, 3], + "effect_type": ["additive", "additive", "additive"]}) + + return complement_valid_alleles(df, ["effect_allele", "other_allele"]) + + +@pytest.fixture +def small_scorefile_no_oa(small_scorefile): + return small_scorefile.with_column(pl.lit(None).alias('other_allele')) + + +@pytest.fixture +def small_flipped_scorefile(small_scorefile): + # simulate a scorefile on the wrong strand + return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) + .drop(['effect_allele', 'other_allele']) + .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) + .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) + + +@pytest.fixture +def small_target(): + return pl.DataFrame({"#CHROM": [1, 2, 3], + "POS": [1, 2, 3], + "REF": ["A", "T", "T"], + "ALT": ["C", "A", "G"], + "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], + "is_multiallelic": [False, False, False]}) + From dde95a7d9cf3c4d65e91220d0a2d1434cfe09be5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 18 Aug 2022 14:40:08 +0100 Subject: [PATCH 40/47] skip scoring files that fail matching, unless no files match --- pgscatalog_utils/match/match.py | 38 ++++++++++++++++-------- pgscatalog_utils/match/match_variants.py | 7 +++-- tests/test_match.py | 2 +- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 4646927..6a3f70c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,21 +1,23 @@ -import polars as pl import logging +import polars as pl + from pgscatalog_utils.match.postprocess import postprocess_matches from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, skip_flip: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, + skip_flip: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) matches: list[pl.DataFrame] = [] - col_order = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', - 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] - + col_order = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', + 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', + 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] if scorefile_oa: logger.debug("Getting matches for scores with effect allele and other allele") @@ -36,23 +38,33 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) -def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> None: +def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> pl.DataFrame: scorefile: pl.DataFrame = scorefile.with_columns([ pl.col('effect_type').cast(pl.Categorical), pl.col('accession').cast(pl.Categorical)]) # same dtypes for join match_log: pl.DataFrame = _join_matches(matches, scorefile, dataset) - write_log(match_log, dataset) fail_rates: pl.DataFrame = (match_log.groupby('accession') .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')) ) - + pass_df: pl.DataFrame = pl.DataFrame() for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): if rate < (1 - min_overlap): + df = pl.DataFrame({'accession': [accession], 'match_pass': [True], 'match_rate': [1 - rate]}) + pass_df = pl.concat([pass_df, df]) logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)") else: + df = pl.DataFrame({'accession': [accession], 'match_pass': [False], 'match_rate': [1 - rate]}) + pass_df = pl.concat([pass_df, df]) logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") - raise Exception + + # add match statistics to log and matches + write_log((match_log.with_column(pl.col('accession').cast(str)) + .join(pass_df, on='accession', how='left')), dataset) + + return (matches.with_column(pl.col('accession').cast(str)) + .join(pass_df, on='accession', how='left')) + def _match_keys(): @@ -111,10 +123,10 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s missing_cols = ['ALT'] join_cols = ['ID'] + missing_cols return (scorefile.join(target, score_keys, target_keys, how='inner') - .with_columns([pl.col("*"), - pl.col(effect_allele_column).alias("matched_effect_allele"), - pl.lit(match_type).alias("match_type")]) - .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join + .with_columns([pl.col("*"), + pl.col(effect_allele_column).alias("matched_effect_allele"), + pl.lit(match_type).alias("match_type")]) + .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index a7b2f96..0333807 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -51,13 +51,14 @@ def match_variants(): raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - check_match_rate(scorefile, matches, args.min_overlap, dataset) + valid_matches: pl.DataFrame = (check_match_rate(scorefile, matches, args.min_overlap, dataset) + .filter(pl.col('match_pass') == True)) - if matches.shape[0] == 0: # this can happen if args.min_overlap = 0 + if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") raise Exception - write_out(matches, args.split, args.outdir, dataset) + write_out(valid_matches, args.split, args.outdir, dataset) def _check_target_chroms(target) -> None: diff --git a/tests/test_match.py b/tests/test_match.py index 24956f7..6f3394d 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -13,7 +13,7 @@ def test_match_fail(combined_scorefile, target_path, tmp_path): args: list[str] = ['match_variants', '-s', combined_scorefile, '-t', target_path, - '-m', 0, + '-m', 1, '-d', 'test', '--outdir', out_dir, '--keep_ambiguous', '--keep_multiallelic'] From cf3a3d61229be243f9688efc6763a0ae9a73bb2a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 19 Aug 2022 11:04:46 +0100 Subject: [PATCH 41/47] don't check target chroms in fast match mode --- pgscatalog_utils/match/match_variants.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0333807..73145c1 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -41,11 +41,8 @@ def match_variants(): args.remove_ambiguous, args.skip_flip) case "fast": logger.debug(f"Match mode: {match_mode}") - check_chrom: bool = False - if n_target_files > 1: - check_chrom = True matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, check_chrom, args.skip_flip) + args.remove_ambiguous, args.skip_flip) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -62,22 +59,22 @@ def match_variants(): def _check_target_chroms(target) -> None: - n_chrom: int = len(target['#CHROM'].unique().to_list()) - if n_chrom > 1: - logger.critical("Multiple chromosomes detected in split file. Check input data.") + chroms: list[str] = target['#CHROM'].unique().to_list() + if len(chroms) > 1: + logger.critical(f"Multiple chromosomes detected: {chroms}. Check input data.") raise Exception else: logger.debug("Split target genome contains one chromosome (good)") + def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, check_chrom: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) - if check_chrom: - _check_target_chroms(target) + logger.debug("Split target chromosomes not checked with fast match mode") return get_all_matches(scorefile, target, remove_ambiguous, skip_filp) From 3ea552f341c8d5489efe31fc57dfc8515f5d28c0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 19 Aug 2022 11:04:57 +0100 Subject: [PATCH 42/47] fix fill_null --- pgscatalog_utils/match/write.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index f56029c..0c7b169 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -117,10 +117,4 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra def _fill_null(df): # nulls are created when pivoting wider - nulls: pl.DataFrame = (df.null_count().filter(pl.col("*") > 0)) - if nulls.is_empty(): # is_empty() avoids weird truthiness problems - logger.debug("No null weights detected") - return df - else: - logger.debug("Filling null weights with zero after pivoting wide") - return df.fill_null(0) + return df.fill_null(strategy="zero") From 3662ed8b2ee29ebdc587c9fc34f4c89a337ce9bb Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 19 Aug 2022 12:18:46 +0100 Subject: [PATCH 43/47] remove redundant function --- pgscatalog_utils/match/write.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 0c7b169..110e308 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -52,13 +52,13 @@ def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: return {x: (df.filter(pl.col("chr_name") == x) .pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") .rename({"matched_effect_allele": "effect_allele"}) - .pipe(_fill_null)) + .fill_null(strategy="zero")) for x in chroms} else: logger.debug("Split output not requested") formatted: pl.DataFrame = (df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") .rename({"matched_effect_allele": "effect_allele"}) - .pipe(_fill_null)) + .fill_null(strategy="zero")) return {'false': formatted} @@ -113,8 +113,3 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra assert n_var == df.shape[0] return df_lst - - -def _fill_null(df): - # nulls are created when pivoting wider - return df.fill_null(strategy="zero") From 64289e575ced0caddbd646902d8c124ecc5a4ccd Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 22 Aug 2022 16:48:28 +0100 Subject: [PATCH 44/47] bump version --- pgscatalog_utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index b794fd4..df9144c 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.0' +__version__ = '0.1.1' From 309fcf54bcab583737078751cd1c19eada3cbbba Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 23 Aug 2022 11:17:49 +0100 Subject: [PATCH 45/47] Citation edits --- README.md | 24 +++++++++++++++++++++++- pyproject.toml | 4 ++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5cfbd02..9790c46 100644 --- a/README.md +++ b/README.md @@ -41,4 +41,26 @@ $ cd pgscatalog_utils $ poetry install $ poetry build $ pip install --user dist/*.whl -``` \ No newline at end of file +``` + +## Credits +The `pgscatalog_utils` package is developed as part of the **Polygenic Score (PGS) Catalog** +([www.PGSCatalog.org](https://www.PGSCatalog.org)) project, a collaboration between the +University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert, Laurent Gil) +and the European Bioinformatics Institute (Helen Parkinson, Aoife McMahon, Ben Wingfield, Laura Harris). + +A manuscript describing the tool and larger PGS Catalog Calculator pipeline +[(`PGSCatalog/pgsc_calc`)](https://github.com/PGScatalog/pgsc_calc) is in preparation. In the meantime +if you use these tools we ask you to cite the repo(s) and the paper describing the PGS Catalog resource: + +- >PGS Catalog utilities _(in development)_. PGS Catalog + Team. [https://github.com/PGScatalog/pgscatalog_utils](https://github.com/PGScatalog/pgscatalog_utils) +- >PGS Catalog Calculator _(in development)_. PGS Catalog + Team. [https://github.com/PGScatalog/pgsc_calc](https://github.com/PGScatalog/pgsc_calc) +- >Lambert _et al._ (2021) The Polygenic Score Catalog as an open database for +reproducibility and systematic evaluation. Nature Genetics. 53:420–425 +doi:[10.1038/s41588-021-00783-5](https://doi.org/10.1038/s41588-021-00783-5). + +This work has received funding from EMBL-EBI core funds, the Baker Institute, the University of Cambridge, +Health Data Research UK (HDRUK), and the European Union's Horizon 2020 research and innovation programme +under grant agreement No 101016775 INTERVENE. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b3968f7..44ef233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [tool.poetry] name = "pgscatalog_utils" version = "0.1.1" -description = "Some useful utilities for working with PGS Catalog data" +description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" -authors = ["Benjamin Wingfield "] +authors = ["Benjamin Wingfield ", "Samuel Lambert "] license = "Apache-2.0" readme = "README.md" From c8ef7c9be434ddbc241c6047725adbfd402efd5e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 23 Aug 2022 15:10:20 +0100 Subject: [PATCH 46/47] update descriptions in args --- README.md | 3 + .../download/download_scorefile.py | 64 ++++++++++++----- pgscatalog_utils/match/match_variants.py | 70 +++++++++++++++--- .../scorefile/combine_scorefiles.py | 72 ++++++++++++------- 4 files changed, 155 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 9790c46..d19c186 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,8 @@ $ combine_scorefiles -s PGS*.txt.gz -o combined.txt $ match_variants -s combined.txt -t --min_overlap 0.75 --outdir . ``` +More details are available using the `--help` parameter. + ## Install from source Requirements: @@ -44,6 +46,7 @@ $ pip install --user dist/*.whl ``` ## Credits + The `pgscatalog_utils` package is developed as part of the **Polygenic Score (PGS) Catalog** ([www.PGSCatalog.org](https://www.PGSCatalog.org)) project, a collaboration between the University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert, Laurent Gil) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index c51aaf0..0bd3298 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -1,11 +1,11 @@ -import logging import argparse +import logging import os import shutil +import textwrap from contextlib import closing from functools import reduce from urllib import request as request -import sys from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url @@ -15,24 +15,8 @@ logger = logging.getLogger(__name__) -def parse_args(args=None) -> argparse.Namespace: - parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files from the PGS Catalog') - parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)') - parser.add_argument('-t', '--efo', dest='efo', nargs='+', - help='Traits described by an EFO term(s) (e.g. EFO_0004611)') - parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+') - parser.add_argument('-b', '--build', dest='build', - help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38') - parser.add_argument('-o', '--outdir', dest='outdir', required=True, - default='scores/', - help=' Output directory to store downloaded files') - parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', - help='Extra logging information') - return parser.parse_args(args) - - def download_scorefile() -> None: - args = parse_args() + args = _parse_args() set_logging_level(args.verbose) _check_args(args) _mkdir(args.outdir) @@ -96,5 +80,47 @@ def _check_args(args): raise Exception +def _description_text() -> str: + return textwrap.dedent('''\ + Download a set of scoring files from the PGS Catalog using PGS + Scoring IDs, traits, or publication IDs. + + The PGS Catalog API is queried to get a list of scoring file + URLs. Scoring files are downloaded via FTP to a specified + directory. PGS Catalog scoring files are staged with the name: + + {PGS_ID}_hmPOS_{genome_build}.txt.gz + + These harmonised scoring files contain genomic coordinates, + remapped from author-submitted information such as rsids. + ''') + + +def _epilog_text() -> str: + return textwrap.dedent('''\ + download_scorefiles will skip downloading a scoring file if it + already exists in the download directory. This can be useful if + the download process is interrupted and needs to be restarted + later. You can track download progress with the verbose flag. + ''') + + +def _parse_args(args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)') + parser.add_argument('-t', '--efo', dest='efo', nargs='+', + help='Traits described by an EFO term(s) (e.g. EFO_0004611)') + parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+') + parser.add_argument('-b', '--build', dest='build', + help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38') + parser.add_argument('-o', '--outdir', dest='outdir', required=True, + default='scores/', + help=' Output directory to store downloaded files') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help=' Extra logging information') + return parser.parse_args(args) + + if __name__ == "__main__": download_scorefile() diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 73145c1..55d7a56 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,5 +1,6 @@ import argparse import logging +import textwrap from glob import glob import polars as pl @@ -103,10 +104,58 @@ def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multi return pl.concat(matches) +def _description_text() -> str: + return textwrap.dedent('''\ + Match variants from a combined scoring file against a set of + target genomes from the same fileset, and output scoring files + compatible with the plink2 --score function. + + A combined scoring file is the output of the combine_scorefiles + script. It has the following structure: + + | chr_name | chr_position | ... | accession | + | -------- | ------------ | --- | --------- | + | 1 | 1 | ... | PGS000802 | + + The combined scoring file is in long format, with one row per + variant for each scoring file (accession). This structure is + different to the PGS Catalog standard, because the long format + makes matching faster and simpler. + + Target genomes can be in plink1 bim format or plink2 pvar + format. Variant IDs should be unique. + + Only one set of target genomes should be matched at a time. Don't + try to match target genomes from different plink + filesets. Matching against a set of chromosomes from the same + fileset is OK (see --split). + ''') + + +def _epilog_text() -> str: + return textwrap.dedent('''\ + match_variants will output at least one scoring file in a + format compatible with the plink2 --score function. This + output might be split across different files to ensure each + variant ID, effect allele, and effect type appears only once + in each file. Output files have the pattern: + + {dataset}_{chromosome}_{effect_type}_{n}.scorefile. + + If multiple chromosomes are combined into a single file (i.e. not + --split), then {chromosome} is replaced with 'ALL'. Once the + scorefiles are used to calculate a score with plink2, the .sscore + files will need to be aggregated to calculate a single polygenic + score for each dataset, sample, and accession (scoring file). The + PGS Catalog Calculator does this automatically. + ''') + + def _parse_args(args=None): - parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants') + parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-d', '--dataset', dest='dataset', required=True, - help=' Label for target genomic dataset (e.g. "-d thousand_genomes")') + help=' Label for target genomic dataset') parser.add_argument('-s', '--scorefiles', dest='scorefile', required=True, help=' Combined scorefile path (output of read_scorefiles.py)') parser.add_argument('-t', '--target', dest='target', required=True, @@ -120,15 +169,18 @@ def _parse_args(args=None): parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', - help='Flag to force the program to keep variants with ambiguous alleles, (e.g. A/T and G/C ' - 'SNPs), which are normally excluded (default: false). In this case the program proceeds ' - 'assuming that the genotype data is on the same strand as the GWAS whose summary ' - 'statistics were used to construct the score.'), + help=''' Flag to force the program to keep variants with + ambiguous alleles, (e.g. A/T and G/C SNPs), which are normally + excluded (default: false). In this case the program proceeds + assuming that the genotype data is on the same strand as the + GWAS whose summary statistics were used to construct the score. + ''') parser.add_argument('--keep_multiallelic', dest='remove_multiallelic', action='store_false', - help='Flag to allow matching to multiallelic variants (default: false).') + help=' Flag to allow matching to multiallelic variants (default: false).') parser.add_argument('--ignore_strand_flips', dest='skip_flip', action='store_true', - help='Flag to not consider matched variants that may be reported on the opposite strand. ' - 'Default behaviour is to flip/complement unmatched variants and check if they match.') + help=''' Flag to not consider matched variants that may be reported + on the opposite strand. Default behaviour is to flip/complement unmatched variants and check if + they match.''') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 02d0ca9..c9e80f4 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,6 +1,8 @@ import argparse -import sys import logging +import sys +import textwrap + import pandas as pd from pgscatalog_utils.log_config import set_logging_level @@ -11,32 +13,8 @@ from pgscatalog_utils.scorefile.write import write_scorefile -def parse_args(args=None) -> argparse.Namespace: - parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Combine multiple scoring files') - parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+', - help=' Scorefile path (wildcard * is OK)', required=True) - parser.add_argument('--liftover', dest='liftover', - help=' Convert scoring file variants to target genome build?', action='store_true') - parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome ', - required='--liftover' in sys.argv) - parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', - required="--liftover" in sys.argv) - parser.add_argument('-m', '--min_lift', dest='min_lift', - help='If liftover, minimum proportion of variants lifted over', - required="--liftover" in sys.argv, default=0.95, type=float) - parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', - help='Drop variants with missing information (chr/pos) and ' - 'non-standard alleles from the output file.') - parser.add_argument('-o', '--outfile', dest='outfile', required=True, - default='combined.txt', - help=' Output path to combined long scorefile') - parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', - help=' Extra logging information') - return parser.parse_args(args) - - def combine_scorefiles(): - args = parse_args() + args = _parse_args() logger = logging.getLogger(__name__) set_logging_level(args.verbose) @@ -61,3 +39,45 @@ def _read_and_melt(path, drop_missing: bool = False): if __name__ == "__main__": combine_scorefiles() + + +def _description_text() -> str: + return textwrap.dedent('''\ + Combine multiple scoring files in PGS Catalog format to a 'long' + table, and optionally liftover genomic coordinates to GRCh37 or + GRCh38. Custom scorefiles in PGS Catalog format can be combined + with PGS Catalog scoring files. The program can accept a mix of + unharmonised and harmonised PGS Catalog data. + ''') + + +def _epilog_text() -> str: + return textwrap.dedent('''\ + The long table is used to simplify intersecting variants in target + genomes and the scoring files with the match_variants program. + ''') + + +def _parse_args(args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+', + help=' Scorefile path (wildcard * is OK)', required=True) + parser.add_argument('--liftover', dest='liftover', + help=' Convert scoring file variants to target genome build?', action='store_true') + parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome ', + required='--liftover' in sys.argv) + parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', + required="--liftover" in sys.argv) + parser.add_argument('-m', '--min_lift', dest='min_lift', + help='If liftover, minimum proportion of variants lifted over', + required="--liftover" in sys.argv, default=0.95, type=float) + parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', + help='Drop variants with missing information (chr/pos) and ' + 'non-standard alleles from the output file.') + parser.add_argument('-o', '--outfile', dest='outfile', required=True, + default='combined.txt', + help=' Output path to combined long scorefile') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help=' Extra logging information') + return parser.parse_args(args) From 50999acb179e5df8239e7b0b8097439e4a249e18 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 23 Aug 2022 16:53:01 +0100 Subject: [PATCH 47/47] Minor description edits --- pgscatalog_utils/download/download_scorefile.py | 4 ++++ pgscatalog_utils/match/match_variants.py | 8 ++++---- pgscatalog_utils/scorefile/combine_scorefiles.py | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 0bd3298..30f8ac8 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -88,6 +88,10 @@ def _description_text() -> str: The PGS Catalog API is queried to get a list of scoring file URLs. Scoring files are downloaded via FTP to a specified directory. PGS Catalog scoring files are staged with the name: + + {PGS_ID}.txt.gz + + If a valid build is specified harmonized files are downloaded as: {PGS_ID}_hmPOS_{genome_build}.txt.gz diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 55d7a56..0d31da6 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -123,12 +123,12 @@ def _description_text() -> str: makes matching faster and simpler. Target genomes can be in plink1 bim format or plink2 pvar - format. Variant IDs should be unique. + format. Variant IDs should be unique so that they can be specified + in the scoring file as: variant_id|effect_allele|[effect_weight column(s)...] Only one set of target genomes should be matched at a time. Don't - try to match target genomes from different plink - filesets. Matching against a set of chromosomes from the same - fileset is OK (see --split). + try to match target genomes from different plink filesets. Matching + against a set of chromosomes from the same fileset is OK (see --split). ''') diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index c9e80f4..35d9b85 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -43,7 +43,8 @@ def _read_and_melt(path, drop_missing: bool = False): def _description_text() -> str: return textwrap.dedent('''\ - Combine multiple scoring files in PGS Catalog format to a 'long' + Combine multiple scoring files in PGS Catalog format (see + https://www.pgscatalog.org/downloads/ for details) to a 'long' table, and optionally liftover genomic coordinates to GRCh37 or GRCh38. Custom scorefiles in PGS Catalog format can be combined with PGS Catalog scoring files. The program can accept a mix of