From e038c8d2ced33ebcbc89a90d29857fc60b1453d9 Mon Sep 17 00:00:00 2001
From: smlmbrt
Date: Thu, 25 Aug 2022 12:27:37 +0100
Subject: [PATCH 01/59] Revised behaviour of combine_scorefiles to not crash when it encounters a duplicated variant position.

---
 .../scorefile/combine_scorefiles.py | 18 ++++++++++--------
 pgscatalog_utils/scorefile/qc.py    |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 35d9b85..ef20102 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -43,19 +43,21 @@ def _read_and_melt(path, drop_missing: bool = False):
 
 def _description_text() -> str:
     return textwrap.dedent('''\
-    Combine multiple scoring files in PGS Catalog format (see
-    https://www.pgscatalog.org/downloads/ for details) to a 'long'
-    table, and optionally liftover genomic coordinates to GRCh37 or
-    GRCh38. Custom scorefiles in PGS Catalog format can be combined
-    with PGS Catalog scoring files. The program can accept a mix of
-    unharmonised and harmonised PGS Catalog data.
+    Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/
+    for details) to a 'long' table of columns needed for variant matching and subsequent calculation.
+
+    Custom scorefiles in PGS Catalog format can be combined with PGS Catalog scoring files, and
+    optionally liftover genomic coordinates to GRCh37 or GRCh38. The script can accept a mix of
+    unharmonised and harmonised PGS Catalog data. By default all variants are output (including
+    positions with duplicated data [often caused by rsID/liftover collisions across builds]) and
+    variants with missing positions.
     ''')
 
 
 def _epilog_text() -> str:
     return textwrap.dedent('''\
-    The long table is used to simplify intersecting variants in target
-    genomes and the scoring files with the match_variants program.
+    The long table is used to simplify intersecting variants in target genotyping datasets
+    and the scoring files with the match_variants program.
''') diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 4316f1e..9b8c98e 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -71,7 +71,7 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: if unique.all(): return df else: - raise Exception("Duplicate variants in scoring file") + logger.warning("Duplicate variants in scoring file.") def _check_shape(df: pd.DataFrame) -> None: From 73154ec48b4debc9a2c662859f586533cd0b983e Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 25 Aug 2022 12:50:44 +0100 Subject: [PATCH 02/59] Needs to return df --- pgscatalog_utils/scorefile/qc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 9b8c98e..ef3cc8a 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -72,6 +72,7 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: return df else: logger.warning("Duplicate variants in scoring file.") + return df def _check_shape(df: pd.DataFrame) -> None: From a4dabb3575b14c74c472e77baf6522ad348c5ca2 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:14:55 +0100 Subject: [PATCH 03/59] Flag duplicated variants within scorefiles (output is_duplicated as bool) --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- pgscatalog_utils/scorefile/qc.py | 11 ++++++----- pgscatalog_utils/scorefile/write.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index ef20102..810ff9b 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -77,7 +77,7 @@ def _parse_args(args=None) -> argparse.Namespace: required="--liftover" in sys.argv, default=0.95, type=float) parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', help='Drop variants with missing information (chr/pos) and ' - 'non-standard alleles from the output file.') + 'non-standard alleles (e.g. 
HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, default='combined.txt', help=' Output path to combined long scorefile') diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index ef3cc8a..bd38991 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -66,14 +66,15 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: logger.warning("Other allele column not detected, dropping other_allele from variant identifier.") group_cols = ['chr_name', 'chr_position', 'effect_allele'] - unique: pd.Series = df.groupby(group_cols).size() == 1 + u_count: pd.Series = df.groupby(group_cols).size() - if unique.all(): - return df + if all(u_count == 1): + return df.assign(is_duplicated=False) else: logger.warning("Duplicate variants in scoring file.") - return df - + u_count = u_count > 1 + u_count.name = 'is_duplicated' + return pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 9204096..1f22197 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -7,7 +7,7 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'accession'] + 'is_duplicated', 'accession'] if df.empty: logger.error("Empty scorefile output! Please check the input data") From 7051f9dca098d596993a8886ef4a0998e6f7b848 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:21:11 +0100 Subject: [PATCH 04/59] Handle null variants, fix test --- pgscatalog_utils/scorefile/qc.py | 4 +++- tests/test_combine.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index bd38991..0e96368 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -74,7 +74,9 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: logger.warning("Duplicate variants in scoring file.") u_count = u_count > 1 u_count.name = 'is_duplicated' - return pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) + df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) + df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos + return df def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" diff --git a/tests/test_combine.py b/tests/test_combine.py index 6243cef..f9ee7a1 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -7,7 +7,8 @@ def test_combine_scorefiles(combined_scorefile, _n_variants): df = pd.read_table(combined_scorefile) - cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession'} + cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', + 'is_duplicated', 'accession'} assert set(df.columns).issubset(cols) assert df.shape[0] == _n_variants From 912d1bd0e1ead87561e9fbef1d17be673e4bf4d1 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:38:23 +0100 Subject: [PATCH 05/59] Implement gzipped output if filename endswith '.gz' --- 
pgscatalog_utils/scorefile/combine_scorefiles.py | 1 + pgscatalog_utils/scorefile/write.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 810ff9b..39bad2b 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -79,6 +79,7 @@ def _parse_args(args=None) -> argparse.Namespace: help='Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, + help='[ will compress output if filename ends with .gz ]', default='combined.txt', help=' Output path to combined long scorefile') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 1f22197..f9762b1 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -13,7 +13,6 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: logger.error("Empty scorefile output! Please check the input data") raise Exception else: - logger.debug("Writing out combined scorefile") out_df: pd.DataFrame = (df.drop('accession', axis=1) .rename({'filename_prefix': 'accession'}, axis=1) .pipe(_filter_failed_liftover)) @@ -21,8 +20,12 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: if 'other_allele' not in out_df: logger.warning("No other allele information detected, writing out as missing data") out_df['other_allele'] = None - - out_df[cols].to_csv(path, index=False, sep="\t") + if path.endswith('.gz'): + logger.debug("Writing out gzip-compressed combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip') + else: + logger.debug("Writing out combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t") def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: From ec5577f91b62547f44b9194ea549e6ab51a8e1fe Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:41:33 +0100 Subject: [PATCH 06/59] Typo --- pgscatalog_utils/scorefile/combine_scorefiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 39bad2b..925ee27 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -79,9 +79,9 @@ def _parse_args(args=None) -> argparse.Namespace: help='Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. 
HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, - help='[ will compress output if filename ends with .gz ]', default='combined.txt', - help=' Output path to combined long scorefile') + help=' Output path to combined long scorefile ' + '[ will compress output if filename ends with .gz ]') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) From eee720ef3bba5031b1127469d45ad4c5f5e90fae Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:53:12 +0100 Subject: [PATCH 07/59] Scoring-file specific warning --- pgscatalog_utils/scorefile/qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 0e96368..ff5c942 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -71,7 +71,7 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: if all(u_count == 1): return df.assign(is_duplicated=False) else: - logger.warning("Duplicate variants in scoring file.") + logger.warning("Duplicate variants in scoring file: {}".format(df['filename_prefix'].unique())) u_count = u_count > 1 u_count.name = 'is_duplicated' df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) From 9cb49c55ae7342f821127023fa650b7efb739243 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 16:31:15 +0100 Subject: [PATCH 08/59] More explict handling of genome_build so that files from different builds can't be combined. --- .../scorefile/combine_scorefiles.py | 53 ++++++++++++---- pgscatalog_utils/scorefile/genome_build.py | 60 +++++++------------ pgscatalog_utils/scorefile/liftover.py | 6 +- pgscatalog_utils/scorefile/qc.py | 4 +- pgscatalog_utils/scorefile/read.py | 57 +++++++++++++++--- 5 files changed, 122 insertions(+), 58 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 925ee27..39637db 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -7,12 +7,14 @@ from pgscatalog_utils.log_config import set_logging_level from pgscatalog_utils.scorefile.read import load_scorefile +from pgscatalog_utils.scorefile.harmonised import remap_harmonised +from pgscatalog_utils.scorefile.qc import quality_control +from pgscatalog_utils.scorefile.genome_build import build2GRC from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights from pgscatalog_utils.scorefile.liftover import liftover from pgscatalog_utils.scorefile.write import write_scorefile - def combine_scorefiles(): args = _parse_args() @@ -21,7 +23,43 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") - scorefiles: pd.DataFrame = pd.concat([_read_and_melt(x, drop_missing=args.drop_missing) for x in paths]) + + scorefiles = [] + for x in paths: + # Read scorefile df and header + h, score = load_scorefile(x) + + # Check if we should use the harmonized positions + use_harmonised = False + current_build = None + if h.get('HmPOS_build') is not None: + if h.get('HmPOS_build') == args.target_build: + use_harmonised = True + current_build = h.get('HmPOS_build') + else: + logger.error(f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build 
{args.target_build}") + raise Exception + + + + score = (score.pipe(remap_harmonised, use_harmonised=True) + .pipe(quality_control, drop_missing=args.drop_missing) + .pipe(melt_effect_weights) + .pipe(set_effect_type).assign(genome_build=current_build)) + # Check if the score is in the right build or could be lifted + if current_build is None: + current_build = build2GRC(h.get('genome_build')) + + if (current_build != args.target_build) and (args.liftover is False): + logger.error( + f"Cannot combine {x} (build={h.get('genome_build')}) with target build {args.target_build} without liftover") + logger.error("Try running with --liftover and specifying the --chain_dir") + raise Exception + + scorefiles.append(score) + + + scorefiles: pd.DataFrame = pd.concat(scorefiles) if args.liftover: logger.debug("Annotating scorefiles with liftover parameters") @@ -30,12 +68,6 @@ def combine_scorefiles(): write_scorefile(scorefiles, args.outfile) -def _read_and_melt(path, drop_missing: bool = False): - """ Load a scorefile, melt it, and set the effect types""" - return (load_scorefile(path, drop_missing=drop_missing) - .pipe(melt_effect_weights) - .pipe(set_effect_type)) - if __name__ == "__main__": combine_scorefiles() @@ -68,8 +100,9 @@ def _parse_args(args=None) -> argparse.Namespace: help=' Scorefile path (wildcard * is OK)', required=True) parser.add_argument('--liftover', dest='liftover', help=' Convert scoring file variants to target genome build?', action='store_true') - parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome ', - required='--liftover' in sys.argv) + parser.add_argument('-t', '--target_build', dest='target_build', + choices=['GRCh37', 'GRCh38'], help='Build of target genome', + required=True) parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', required="--liftover" in sys.argv) parser.add_argument('-m', '--min_lift', dest='min_lift', diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index 9ded7ed..d145a2f 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -1,55 +1,41 @@ -import gzip -import io import logging -import re -from typing import TextIO import pandas as pd +from pgscatalog_utils.scorefile.read import _read_header + logger = logging.getLogger(__name__) def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: - """ Annotate the dataframe with genome build data """ + """ Annotate the dataframe with genome build data """ logger.debug(f"Annotating target build: {target_build}") build_dict: dict = {'GRCh37': 'hg19', 'GRCh38': 'hg38', 'hg19': 'hg19', 'hg38': 'hg38'} # standardise build names - df['target_build'] = build_dict[target_build] - - builds: pd.DataFrame = _get_builds(df['filename'].drop_duplicates()) - builds['genome_build'] = builds.apply(lambda x: build_dict[x.genome_build], axis=1) - return df.merge(builds, how="left", on="filename") - - -def _read_header(f: TextIO) -> str: - """ Extract genome build of scorefile from PGS Catalog header format """ - for line in f: - if re.search("^#genome_build", line): - # get #genome_build=GRCh37 from header - header = line.replace('\n', '').replace('#', '').split('=') - # and remap to liftover style - try: - build: str = header[-1] - logger.debug(f"Valid genome build detected: {build}") - return build - except KeyError: - raise Exception("Bad genome build detected in header") - elif line[0] != '#': - raise 
Exception("No genome build detected in header") + df['chain_target_build'] = build_dict[target_build] + df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']]) + return df + +def build2GRC(build): + """Map build names so they can be compared with GRCh37 and 38""" + build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', 'hg38': 'GRCh38'} # standardise build names + if build is None: + return None + else: + return build_2_GRC_dict.get(build) + def _read_build(path: str) -> str: """ Open scorefiles and automatically handle compressed input """ logger.debug(f'Reading header of {path}') - try: - with io.TextIOWrapper(gzip.open(path, 'r')) as f: - return _read_header(f) - except gzip.BadGzipFile: - with open(path, 'r') as f: - return _read_header(f) + h = _read_header(path) + return {k: h.get(k, None) for k in ('genome_build', 'HmPOS_build')} -def _get_builds(s: pd.Series) -> pd.DataFrame: +def _get_builds(paths: list) -> pd.DataFrame: """ Get genome builds for a series of scorefile paths - | filename | -> | filename | genome_build | - | x.txt.gz | | x.txt.gz | hg19 | + | filename | -> | | genome_build | HmPOS_build | + | x.txt.gz | | x.txt.gz | hg19 | None | + | x_hmPOS_GRCh37.txt.gz | | x_hmPOS_GRCh37.txt.gz | hg19 | GRCh37 | """ - return pd.concat([s, s.apply(_read_build).rename("genome_build")], axis=1) + return pd.DataFrame.from_dict({path: _read_build(path) for path in paths}, orient='index') + diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 0d3008c..2680a09 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -12,8 +12,8 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st df = annotate_build(df, target_build) # grab build from scoring file headers mapped, unmapped = pd.DataFrame(), pd.DataFrame() - no_liftover: pd.DataFrame = df.query('target_build == genome_build') - to_liftover: pd.DataFrame = df.query('target_build != genome_build') + no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build') + to_liftover: pd.DataFrame = df.query('chain_target_build != chain_genome_build') if no_liftover.empty: logger.debug("Liftover required for all scorefile variants") @@ -65,7 +65,7 @@ def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) if df[['chr_name', 'chr_position']].isnull().values.any(): converted = None else: - lo = lo_dict[df['genome_build'] + df['target_build']] # extract lo object from dict + lo = lo_dict[df['chain_genome_build'] + df['chain_target_build']] # extract lo object from dict chrom: str = 'chr' + str(df['chr_name']) pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed # converted example: [('chr22', 15460378, '+', 3320966530)] or None diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index ff5c942..36b20c0 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -84,6 +84,8 @@ def _check_shape(df: pd.DataFrame) -> None: def _check_columns(df: pd.DataFrame) -> None: - assert {'chr_name', 'chr_position'}.issubset(df.columns), "If you're using rsids did you request harmonised data?" + assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromsomal positions. If you're " \ + "using PGS Catalog files with rsIDs you should request " \ + "harmonised data files (HmPOS) instead." 
assert 'effect_allele' in df, "ERROR: Missing effect allele column" diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index 7674c7c..a3e6997 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -1,20 +1,46 @@ import os import pandas as pd import logging -from .harmonised import remap_harmonised -from .qc import quality_control + +import gzip +import io logger = logging.getLogger(__name__) -def load_scorefile(path: str, use_harmonised: bool = True, drop_missing: bool = False) -> pd.DataFrame: +def load_scorefile(path: str) -> pd.DataFrame: logger.debug(f'Reading scorefile {path}') - return (pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) - .pipe(remap_harmonised, use_harmonised=use_harmonised) + return (_read_header(path), + pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) .assign(filename_prefix=_get_basename(path), - filename=path) - .pipe(quality_control, drop_missing=drop_missing)) + filename=path)) + + +def _read_header(path: str) -> dict: + """Parses the header of a PGS Catalog format scorefle into a dictionary""" + try: + f = io.TextIOWrapper(gzip.open(path, 'r')) + except gzip.BadGzipFile: + f = open(path, 'r') + + header = {} + lastline = '#' + while lastline.startswith('#'): + lastline = f.readline() + line = lastline.strip() + if line.startswith('#'): + if '=' in line: + line = line[1:].split('=') + field, val = [x.strip() for x in line] + if field in remap_header: + header[remap_header[field]] = val + else: + header[field] = val + if ('genome_build' in header) and (header['genome_build'] == 'NR'): + header['genome_build'] = None + f.close() + return header def _scorefile_dtypes() -> dict[str]: """ Data types for columns that might be found in a scorefile """ @@ -27,3 +53,20 @@ def _get_basename(path: str) -> str: """ Return the basename of a scoring file without extension """ return os.path.basename(path).split('.')[0] +remap_header = { + 'PGS ID': 'pgs_id', + 'PGS Name': 'pgs_name', + 'Reported Trait': 'trait_reported', + 'Original Genome Build': 'genome_build', + 'Number of Variants': 'variants_number', + 'PGP ID': 'pgp_id', + 'Citation': 'citation', + 'LICENSE': 'license', + # Harmonization related + 'HmPOS Build': 'HmPOS_build', + 'HmPOS Date':'HmPOS_date', + 'HmVCF Reference': 'HmVCF_ref', + 'HmVCF Date': 'HmVCF_date', + 'HmVCF N Matched Variants': 'HmVCF_n_matched', + 'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped' +} # Used to maintain reverse compatibility to old scoring files \ No newline at end of file From 4813c2bdf781c35089dfcd3be10b2df9a95b8bb8 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 16:37:26 +0100 Subject: [PATCH 09/59] Set genome_build in the correct place --- pgscatalog_utils/scorefile/combine_scorefiles.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 39637db..9ab594c 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -40,15 +40,16 @@ def combine_scorefiles(): logger.error(f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") raise Exception - - + # Process/QC score and check variant columns score = (score.pipe(remap_harmonised, use_harmonised=True) .pipe(quality_control, drop_missing=args.drop_missing) .pipe(melt_effect_weights) 
- .pipe(set_effect_type).assign(genome_build=current_build)) + .pipe(set_effect_type)) + # Check if the score is in the right build or could be lifted if current_build is None: current_build = build2GRC(h.get('genome_build')) + score = score.assign(genome_build=current_build) if (current_build != args.target_build) and (args.liftover is False): logger.error( From 9c33eee0b98e494513da0ebfcec6fbb61ec2a099 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 26 Aug 2022 17:09:57 +0100 Subject: [PATCH 10/59] prune duplicate variants after match prioritisation --- pgscatalog_utils/match/match.py | 2 -- pgscatalog_utils/match/postprocess.py | 40 ++++++++++++++++++++------- pgscatalog_utils/match/preprocess.py | 8 ------ pgscatalog_utils/match/read.py | 9 +++--- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 6a3f70c..9d92719 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -65,8 +65,6 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap return (matches.with_column(pl.col('accession').cast(str)) .join(pass_df, on='accession', how='left')) - - def _match_keys(): return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 33a0220..e002d27 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -30,28 +30,48 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: return (df.with_column( pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) - .otherwise(False))).pipe(_get_distinct_weights) + .otherwise(False))).pipe(_prune_matches) -def _get_distinct_weights(df: pl.DataFrame) -> pl.DataFrame: +def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: """ Select single matched variant in target for each variant in the scoring file (e.g. 
per accession) """ - count: pl.DataFrame = df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() - singletons: pl.DataFrame = (count.filter(pl.col('count') == 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) - - dups: pl.DataFrame = (count.filter(pl.col('count') > 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + dups: pl.DataFrame = _get_duplicate_variants(df) if dups: - distinct: pl.DataFrame = pl.concat([singletons, _prioritise_match_type(dups)]) + logger.debug("First match pruning: prioritise by match types") + singletons: pl.DataFrame = _get_singleton_variants(df) + prioritised: pl.DataFrame = _prioritise_match_type(dups) + prioritised_dups: pl.DataFrame = _get_duplicate_variants(prioritised) + if prioritised_dups: + logger.debug("Final match pruning: dropping any duplicates remaining") + prioritised_singletons: pl.DataFrame = _get_singleton_variants(prioritised) + distinct: pl.DataFrame = pl.concat([singletons, prioritised_singletons]) + else: + logger.debug("Final match pruning skipped (not required)") + distinct: pl.DataFrame = pl.concat([singletons, prioritised]) else: - distinct: pl.DataFrame = singletons + distinct: pl.DataFrame = df assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" + logger.debug("Match pruning complete") return distinct +def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + .count() + .filter(pl.col('count') == 1)[:, "accession":"effect_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + + +def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + .count() + .filter(pl.col('count') > 1)[:, "accession":"effect_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + + def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: dup_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") != None) dup_no_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") == None) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3cc66f7..29579b2 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -51,13 +51,5 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) return df -def check_weights(df: pl.DataFrame) -> None: - """ Checks weights for scoring file variants that could be matched (e.g. 
have a chr & pos) """ - weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() - if any(weight_count['count'] > 1): - logger.error("Multiple effect weights per variant per accession detected in files: {}".format(list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) - raise Exception - - def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: df.with_column(pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) \ No newline at end of file diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index edb69b5..f8f5b3e 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -4,7 +4,7 @@ import polars as pl -from pgscatalog_utils.match.preprocess import handle_multiallelic, check_weights, complement_valid_alleles +from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles logger = logging.getLogger(__name__) @@ -33,10 +33,12 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, match target.file_format: case 'bim': - return (df[_default_cols()] + return (df.select(_default_cols()) + .filter(pl.col('ID') != '.') # remove missing IDs .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False)) case 'pvar': - return (df[_default_cols()] + return (df.select(_default_cols()) + .filter(pl.col('ID') != '.') .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True)) case _: logger.error("Invalid file format detected") @@ -47,7 +49,6 @@ def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])) - check_weights(scorefile) return scorefile From 242e4a330087568f3499f770cb449162fe8e28f3 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Aug 2022 12:57:34 +0100 Subject: [PATCH 11/59] Add explicit build choices to downloads --- pgscatalog_utils/download/download_scorefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 30f8ac8..fc35529 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -116,7 +116,7 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-t', '--efo', dest='efo', nargs='+', help='Traits described by an EFO term(s) (e.g. EFO_0004611)') parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. 
PGP000007)', nargs='+') - parser.add_argument('-b', '--build', dest='build', + parser.add_argument('-b', '--build', dest='build', choices=['GRCh37', 'GRCh38'], help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38') parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', From 21dfcb8d3c79181c7bfd589443a1237769392856 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Aug 2022 12:58:56 +0100 Subject: [PATCH 12/59] Make sure liftover works with chains and mixed files --- pgscatalog_utils/scorefile/combine_scorefiles.py | 12 ++++++++---- pgscatalog_utils/scorefile/genome_build.py | 2 +- pgscatalog_utils/scorefile/liftover.py | 8 ++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 9ab594c..9836cfe 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -46,10 +46,11 @@ def combine_scorefiles(): .pipe(melt_effect_weights) .pipe(set_effect_type)) - # Check if the score is in the right build or could be lifted + # Annotate score with the genome_build (in GRCh notation) if current_build is None: current_build = build2GRC(h.get('genome_build')) - score = score.assign(genome_build=current_build) + + score = score.assign(genome_build=current_build) if (current_build != args.target_build) and (args.liftover is False): logger.error( @@ -59,8 +60,11 @@ def combine_scorefiles(): scorefiles.append(score) - - scorefiles: pd.DataFrame = pd.concat(scorefiles) + if len(scorefiles) > 0: + scorefiles: pd.DataFrame = pd.concat(scorefiles) + else: + logger.error("No valid scorefiles could be combined") + raise Exception if args.liftover: logger.debug("Annotating scorefiles with liftover parameters") diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index d145a2f..5fe4488 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -17,7 +17,7 @@ def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: def build2GRC(build): """Map build names so they can be compared with GRCh37 and 38""" build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', 'hg38': 'GRCh38'} # standardise build names - if build is None: + if pd.isnull(build): return None else: return build_2_GRC_dict.get(build) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 2680a09..ee8902e 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -9,7 +9,7 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: str) -> pd.DataFrame: """ Liftover genomic coordinates to a different genome build """ - df = annotate_build(df, target_build) # grab build from scoring file headers + df = annotate_build(df, target_build) # get chain_target_build (e.g. 
in hg notation to match chain files) mapped, unmapped = pd.DataFrame(), pd.DataFrame() no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build') @@ -19,15 +19,15 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st logger.debug("Liftover required for all scorefile variants") else: logger.debug("Skipping liftover for scorefiles with same build as target genome") - no_liftover[['lifted_chr', 'lifted_pos']] = no_liftover[['chr_name', 'chr_position']] # assume col structure + no_liftover.loc[:,['lifted_chr', 'lifted_pos']] = no_liftover[['chr_name', 'chr_position']] # assume col structure no_liftover.assign(liftover=None) if to_liftover.empty: logger.debug("Liftover skipped because no variants required it") else: + lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files logger.debug("Lifting over scoring files") - lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) - to_liftover[['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) + to_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) logger.debug("Liftover complete") mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] From 2461ffb96a3f1a33b16137ec7be2d36d076e076f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 30 Aug 2022 14:13:37 +0100 Subject: [PATCH 13/59] keep track of variants that pass pruning --- pgscatalog_utils/match/match.py | 8 +++-- pgscatalog_utils/match/match_variants.py | 22 +++++++----- pgscatalog_utils/match/postprocess.py | 46 ++++++++++++++++++------ pgscatalog_utils/scorefile/qc.py | 3 +- tests/test_match.py | 14 ++++---- 5 files changed, 63 insertions(+), 30 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 9d92719..ac566b2 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -9,7 +9,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, - skip_flip: bool) -> pl.DataFrame: + skip_flip: bool, keep_first_match: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -35,7 +35,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) + return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous, keep_first_match) def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> pl.DataFrame: @@ -58,6 +58,9 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap pass_df = pl.concat([pass_df, df]) logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") + # TODO: fill nulls in certain columns with false in a nicer way + match_log['passes_pruning'] = match_log['passes_pruning'].fill_null(False) + # add match statistics to log and matches write_log((match_log.with_column(pl.col('accession').cast(str)) .join(pass_df, on='accession', how='left')), 
dataset) @@ -65,6 +68,7 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap return (matches.with_column(pl.col('accession').cast(str)) .join(pass_df, on='accession', how='left')) + def _match_keys(): return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0d31da6..a0d0230 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -35,15 +35,16 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, args.skip_flip) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, + args.skip_flip, args.keep_first_match) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip) + args.remove_ambiguous, args.skip_flip, args.keep_first_match) case "fast": logger.debug(f"Match mode: {match_mode}") matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip) + args.remove_ambiguous, args.skip_flip, args.keep_first_match) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -69,37 +70,37 @@ def _check_target_chroms(target) -> None: def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, remove_ambiguous, skip_filp) + return get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) # _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp)) + matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) return pl.concat(matches) def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, remove_ambiguous, 
skip_filp)) + matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) return pl.concat(matches) @@ -181,6 +182,9 @@ def _parse_args(args=None): help=''' Flag to not consider matched variants that may be reported on the opposite strand. Default behaviour is to flip/complement unmatched variants and check if they match.''') + parser.add_argument('--keep_first_match', dest='keep_first_match', action='store_true', + help=''' If multiple match candidates for a variant exist that can't be prioritised, + keep the first match candidate (default: drop all candidates)''') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index e002d27..b2e6472 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -7,8 +7,15 @@ logger = logging.getLogger(__name__) -def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: - df = _label_biallelic_ambiguous(df) +def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: + """ Clean up match candidates ready for writing out, including: + + - Label ambiguous variants + - Prune match candidates to select the best match for each variant in the scoring file + - Optionally remove ambiguous variants + """ + df = _label_biallelic_ambiguous(df).pipe(_prune_matches, keep_first_match) + if remove_ambiguous: logger.debug("Removing ambiguous matches") return df.filter(pl.col("ambiguous") == False) @@ -30,11 +37,23 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: return (df.with_column( pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) - .otherwise(False))).pipe(_prune_matches) + .otherwise(False))) + + +def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFrame: + """ Select the best match candidate in the target for each variant in the scoring file + + - In a scoring file (accession), each variant ID with the same effect allele and weight *must be unique* + - The variant matching process normally returns multiple match candidates for each variant ID, e.g.: + refalt > altref > refalt_flip > altref_flip + - When multiple match candidates for an ID exist, they must be prioritised and pruned to be unique + - If it's impossible to prioritise match candidates (i.e. same strategy is used), drop all matches by default + :param df: A dataframe containing multiple match candidates for each variant + :param drop_duplicates: If it's impossible to make match candidates unique, drop all candidates? + :return: A dataframe containing the best match candidate for each variant + """ -def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: - """ Select single matched variant in target for each variant in the scoring file (e.g. 
per accession) """ dups: pl.DataFrame = _get_duplicate_variants(df) if dups: @@ -42,12 +61,15 @@ def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: singletons: pl.DataFrame = _get_singleton_variants(df) prioritised: pl.DataFrame = _prioritise_match_type(dups) prioritised_dups: pl.DataFrame = _get_duplicate_variants(prioritised) - if prioritised_dups: - logger.debug("Final match pruning: dropping any duplicates remaining") - prioritised_singletons: pl.DataFrame = _get_singleton_variants(prioritised) - distinct: pl.DataFrame = pl.concat([singletons, prioritised_singletons]) + if prioritised_dups and not keep_first_match: + logger.debug("Final match pruning: dropping remaining duplicate matches") + distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised)]) + elif prioritised_dups and keep_first_match: + logger.debug("Final match pruning: keeping first match") + distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised), + prioritised.unique(maintain_order=True)]) else: - logger.debug("Final match pruning skipped (not required)") + logger.debug("Final match pruning unnecessary") distinct: pl.DataFrame = pl.concat([singletons, prioritised]) else: distinct: pl.DataFrame = df @@ -55,10 +77,11 @@ def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" logger.debug("Match pruning complete") - return distinct + return distinct.with_column(pl.lit(True).alias('passes_pruning')) def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: + """ Return variants with only one row (match candidate) per variant ID """ return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) .count() .filter(pl.col('count') == 1)[:, "accession":"effect_allele"] @@ -66,6 +89,7 @@ def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: + """ Return variants with more than one row (match candidate) per variant ID """ return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) .count() .filter(pl.col('count') > 1)[:, "accession":"effect_allele"] diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index ff5c942..fa5a6a8 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -75,9 +75,10 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: u_count = u_count > 1 u_count.name = 'is_duplicated' df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) - df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos + df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos return df + def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)" diff --git a/tests/test_match.py b/tests/test_match.py index 6f3394d..d437aa3 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -46,14 +46,14 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) assert 
set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles: # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False)\ + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False)\ .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref'}) @@ -62,12 +62,12 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True, keep_first_match=False) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # one of the matches is ambiguous - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) @@ -76,14 +76,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) assert df.is_empty() - flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False) + flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False, keep_first_match=False) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) - flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) + flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) .filter(pl.col('ambiguous') == True)) assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous From b190429d497e3fad9815a264a9441259eadae496 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:16:46 +0100 Subject: [PATCH 14/59] fix liftover --- pgscatalog_utils/scorefile/liftover.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index ee8902e..df9f5e1 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -25,9 +25,9 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st if to_liftover.empty: logger.debug("Liftover skipped because no variants required it") else: - lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files + lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files logger.debug("Lifting over scoring files") - to_liftover.loc[:, 
['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) + to_liftover[['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) logger.debug("Liftover complete") mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] From dca90de499c72c9055a40298029bc9aa855ed0d7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:17:23 +0100 Subject: [PATCH 15/59] fix reading plain text filesc --- pgscatalog_utils/scorefile/read.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index a3e6997..61d6d44 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -1,14 +1,18 @@ import os +from typing import Tuple + import pandas as pd import logging import gzip import io +from pandas import DataFrame + logger = logging.getLogger(__name__) -def load_scorefile(path: str) -> pd.DataFrame: +def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: logger.debug(f'Reading scorefile {path}') return (_read_header(path), pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) @@ -18,8 +22,9 @@ def load_scorefile(path: str) -> pd.DataFrame: def _read_header(path: str) -> dict: """Parses the header of a PGS Catalog format scorefle into a dictionary""" + f = io.TextIOWrapper(gzip.open(path, 'r')) try: - f = io.TextIOWrapper(gzip.open(path, 'r')) + f.readline() except gzip.BadGzipFile: f = open(path, 'r') @@ -42,6 +47,7 @@ def _read_header(path: str) -> dict: f.close() return header + def _scorefile_dtypes() -> dict[str]: """ Data types for columns that might be found in a scorefile """ return {'rsID': str, 'chr_name': str, 'chr_position': pd.UInt64Dtype(), 'effect_allele': 'str', @@ -69,4 +75,4 @@ def _get_basename(path: str) -> str: 'HmVCF Date': 'HmVCF_date', 'HmVCF N Matched Variants': 'HmVCF_n_matched', 'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped' -} # Used to maintain reverse compatibility to old scoring files \ No newline at end of file +} # Used to maintain reverse compatibility to old scoring files From d198b9cd2557d4e2b11a503efa744f5793b8df2b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:17:59 +0100 Subject: [PATCH 16/59] fix tests --- conftest.py | 8 ++++---- tests/test_combine.py | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index a30f2cd..96b33bd 100644 --- a/conftest.py +++ b/conftest.py @@ -66,7 +66,7 @@ def mini_score_path(tmp_path_factory): def mini_scorefile(mini_score_path, tmp_path_factory): # The mini scorefile overlaps well with cineca synthetic subset out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt" - args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())] + args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() @@ -78,7 +78,7 @@ def mini_scorefile(mini_score_path, tmp_path_factory): def combined_scorefile(scorefiles, tmp_path_factory): # The combined scorefile overlaps poorly with cineca synthetic subset out_path = tmp_path_factory.mktemp("scores") / "combined.txt" - args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['-o', str(out_path.resolve())] + args: list[str] = ['combine_scorefiles', '-t', 
'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() @@ -111,9 +111,9 @@ def chain_files(db, tmp_path_factory): @pytest.fixture(scope="session") -def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory): +def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" - args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38', + args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', 'GRCh38', '-m', '0.8'] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): diff --git a/tests/test_combine.py b/tests/test_combine.py index f9ee7a1..c76bcdc 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,8 +1,11 @@ +from unittest.mock import patch + import pandas as pd import pytest import jq from pgscatalog_utils.download.score import query_score +from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles def test_combine_scorefiles(combined_scorefile, _n_variants): @@ -15,7 +18,16 @@ def test_combine_scorefiles(combined_scorefile, _n_variants): def test_liftover(lifted_scorefiles): df = pd.read_table(lifted_scorefiles) - assert df.shape[0] > 50000 # approx size + assert df.shape[0] == 832 # approx size + + +def test_fail_combine(scorefiles, tmp_path_factory): + # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception + with pytest.raises(Exception): + out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + args: list[str] = ['combine_scorefiles', '-t', 'GRCh38', '-s'] + scorefiles + ['-o', str(out_path.resolve())] + with patch('sys.argv', args): + combine_scorefiles() @pytest.fixture @@ -23,3 +35,4 @@ def _n_variants(pgs_accessions): json = query_score(pgs_accessions) n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() return sum(n) + From df6b8699f7e868789e4cbe0e9016cc918600333e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:18:17 +0100 Subject: [PATCH 17/59] fix calling _parse_args() --- .../scorefile/combine_scorefiles.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 9836cfe..6b27641 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -15,6 +15,7 @@ from pgscatalog_utils.scorefile.liftover import liftover from pgscatalog_utils.scorefile.write import write_scorefile + def combine_scorefiles(): args = _parse_args() @@ -73,16 +74,11 @@ def combine_scorefiles(): write_scorefile(scorefiles, args.outfile) - -if __name__ == "__main__": - combine_scorefiles() - - def _description_text() -> str: return textwrap.dedent('''\ Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/ for details) to a 'long' table of columns needed for variant matching and subsequent calculation. - + Custom scorefiles in PGS Catalog format can be combined with PGS Catalog scoring files, and optionally liftover genomic coordinates to GRCh37 or GRCh38. The script can accept a mix of unharmonised and harmonised PGS Catalog data. 
By default all variants are output (including @@ -106,15 +102,15 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('--liftover', dest='liftover', help=' Convert scoring file variants to target genome build?', action='store_true') parser.add_argument('-t', '--target_build', dest='target_build', - choices=['GRCh37', 'GRCh38'], help='Build of target genome', + choices=['GRCh37', 'GRCh38'], help=' Build of target genome', required=True) parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', required="--liftover" in sys.argv) parser.add_argument('-m', '--min_lift', dest='min_lift', - help='If liftover, minimum proportion of variants lifted over', + help=' If liftover, minimum proportion of variants lifted over', required="--liftover" in sys.argv, default=0.95, type=float) parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', - help='Drop variants with missing information (chr/pos) and ' + help=' Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, default='combined.txt', @@ -123,3 +119,8 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) + + +if __name__ == "__main__": + combine_scorefiles() + From 1be72e970b569a5d29ba3deb368a86927ceee310 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:46:22 +0100 Subject: [PATCH 18/59] fix test_liftover --- conftest.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index 96b33bd..4bde081 100644 --- a/conftest.py +++ b/conftest.py @@ -123,15 +123,11 @@ def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): @pytest.fixture(scope="session") -def hg38_coords(tmp_path_factory): - out_path = tmp_path_factory.mktemp("dummy") / "hg38.txt" +def hg38_coords(): d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]} df = pd.DataFrame(d) - with open(out_path, 'w') as f: - f.write('#genome_build=GRCh38\n') - df.to_csv(out_path, mode='a', index=False) - df['filename'] = str(out_path.resolve()) df['accession'] = 'dummy' + df['genome_build'] = 'GRCh38' return df From 258db50d432a695d789b0f4aef27801b7a41728f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:49:18 +0100 Subject: [PATCH 19/59] reformat and optimise imports --- .github/workflows/main.yml | 2 +- conftest.py | 19 ++++++----- pgscatalog_utils/download/publication.py | 6 ++-- pgscatalog_utils/download/score.py | 10 +++--- pgscatalog_utils/download/trait.py | 3 +- pgscatalog_utils/log_config.py | 2 +- pgscatalog_utils/match/match.py | 1 - pgscatalog_utils/match/match_variants.py | 3 +- pgscatalog_utils/match/postprocess.py | 3 +- pgscatalog_utils/match/preprocess.py | 32 +++++++++++-------- pgscatalog_utils/match/write.py | 14 ++++---- .../scorefile/combine_scorefiles.py | 18 +++++------ pgscatalog_utils/scorefile/effect_type.py | 3 +- pgscatalog_utils/scorefile/effect_weight.py | 5 ++- pgscatalog_utils/scorefile/genome_build.py | 9 +++--- pgscatalog_utils/scorefile/harmonised.py | 3 +- pgscatalog_utils/scorefile/liftover.py | 17 ++++++---- pgscatalog_utils/scorefile/qc.py | 7 ++-- pgscatalog_utils/scorefile/read.py | 13 +++----- pgscatalog_utils/scorefile/write.py | 5 ++- 
tests/test_combine.py | 3 +- tests/test_download.py | 8 +++-- tests/test_liftover.py | 1 + tests/test_match.py | 9 +++--- 24 files changed, 105 insertions(+), 91 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6477922..bf0f138 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,5 @@ name: CI -on: [push] +on: [ push ] jobs: ci: diff --git a/conftest.py b/conftest.py index 4bde081..e322b96 100644 --- a/conftest.py +++ b/conftest.py @@ -1,12 +1,14 @@ -import pytest -from unittest.mock import patch -from pgscatalog_utils.download.download_scorefile import download_scorefile +import glob import os +from unittest.mock import patch + +import pandas as pd +import pytest import requests as req -from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles from pysqlar import SQLiteArchive -import pandas as pd -import glob + +from pgscatalog_utils.download.download_scorefile import download_scorefile +from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles @pytest.fixture(scope="session") @@ -113,8 +115,9 @@ def chain_files(db, tmp_path_factory): @pytest.fixture(scope="session") def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" - args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', 'GRCh38', - '-m', '0.8'] + ['-o', str(out_path.resolve())] + args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', + 'GRCh38', + '-m', '0.8'] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py index b5e90fa..843b8a2 100644 --- a/pgscatalog_utils/download/publication.py +++ b/pgscatalog_utils/download/publication.py @@ -1,7 +1,8 @@ -import requests import logging from functools import reduce +import requests + logger = logging.getLogger(__name__) @@ -17,6 +18,3 @@ def query_publication(pgp: str) -> list[str]: pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids') logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}") return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values())) - - - diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index 61a0154..a38dc0c 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -1,8 +1,9 @@ -import requests import logging -import jq import sys +import jq +import requests + logger = logging.getLogger(__name__) @@ -36,7 +37,7 @@ def query_score(pgs_id: list[str]) -> dict: def _chunker(pgs: list[str]): size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs - return(pgs[pos: pos + size] for pos in range(0, len(pgs), size)) + return (pgs[pos: pos + size] for pos in range(0, len(pgs), size)) def _parse_json_query(json: dict, build: str | None) -> dict[str, str]: @@ -53,5 +54,6 @@ def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]: result: list[str] = jq.compile(f'[.results][][].ftp_scoring_file').input( json).all() else: - result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all() + result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input( + json).all() return dict(zip(id, [x.replace('https', 'ftp') for x in result])) diff --git 
a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index 981b40d..c2db495 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -1,7 +1,8 @@ -import requests import logging from functools import reduce +import requests + logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/log_config.py b/pgscatalog_utils/log_config.py index f1509a9..dcd9cbe 100644 --- a/pgscatalog_utils/log_config.py +++ b/pgscatalog_utils/log_config.py @@ -12,4 +12,4 @@ def set_logging_level(verbose: bool): else: logging.basicConfig(level=logging.WARNING, format=log_fmt, - datefmt='%Y-%m-%d %H:%M:%S') \ No newline at end of file + datefmt='%Y-%m-%d %H:%M:%S') diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 6a3f70c..927327c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -66,7 +66,6 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap .join(pass_df, on='accession', how='left')) - def _match_keys(): return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0d31da6..c2a3381 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -35,7 +35,8 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, args.skip_flip) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, + args.skip_flip) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 33a0220..13ef74c 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -1,6 +1,7 @@ +import logging from functools import reduce + import polars as pl -import logging from pgscatalog_utils.match.preprocess import complement_valid_alleles diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3cc66f7..0b073fc 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -1,6 +1,7 @@ -import polars as pl import logging +import polars as pl + logger = logging.getLogger(__name__) @@ -12,16 +13,16 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF new_col = col + '_FLIP' df = df.with_column( pl.when(pl.col(col).str.contains('^[ACGT]+$')) - .then(pl.col(col).str.replace_all("A", "V") - .str.replace_all("T", "X") - .str.replace_all("C", "Y") - .str.replace_all("G", "Z") - .str.replace_all("V", "T") - .str.replace_all("X", "A") - .str.replace_all("Y", "G") - .str.replace_all("Z", "C")) - .otherwise(pl.col(col)) - .alias(new_col) + .then(pl.col(col).str.replace_all("A", "V") + .str.replace_all("T", "X") + .str.replace_all("C", "Y") + .str.replace_all("G", "Z") + .str.replace_all("V", "T") + .str.replace_all("X", "A") + .str.replace_all("Y", "G") + .str.replace_all("Z", "C")) + .otherwise(pl.col(col)) + .alias(new_col) ) return df @@ -53,11 +54,14 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) def check_weights(df: pl.DataFrame) -> None: """ Checks weights for 
scoring file variants that could be matched (e.g. have a chr & pos) """ - weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() + weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby( + ['accession', 'chr_name', 'chr_position', 'effect_allele']).count() if any(weight_count['count'] > 1): - logger.error("Multiple effect weights per variant per accession detected in files: {}".format(list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) + logger.error("Multiple effect weights per variant per accession detected in files: {}".format( + list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) raise Exception def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: - df.with_column(pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) \ No newline at end of file + df.with_column( + pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 110e308..1935bd5 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -1,7 +1,8 @@ -import polars as pl import logging import os +import polars as pl + logger = logging.getLogger(__name__) @@ -56,9 +57,10 @@ def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: for x in chroms} else: logger.debug("Split output not requested") - formatted: pl.DataFrame = (df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") - .rename({"matched_effect_allele": "effect_allele"}) - .fill_null(strategy="zero")) + formatted: pl.DataFrame = ( + df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") + .rename({"matched_effect_allele": "effect_allele"}) + .fill_null(strategy="zero")) return {'false': formatted} @@ -87,8 +89,8 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra # 2. use cumcount to number duplicate IDs # 3. 
join cumcount data on original DF, use this data for splitting ea_count: pl.DataFrame = (df.select(["ID", "effect_allele"]) - .distinct() - .with_columns([ + .distinct() + .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") ])) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 6b27641..6efeb51 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -6,13 +6,13 @@ import pandas as pd from pgscatalog_utils.log_config import set_logging_level -from pgscatalog_utils.scorefile.read import load_scorefile -from pgscatalog_utils.scorefile.harmonised import remap_harmonised -from pgscatalog_utils.scorefile.qc import quality_control -from pgscatalog_utils.scorefile.genome_build import build2GRC from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights +from pgscatalog_utils.scorefile.genome_build import build2GRC +from pgscatalog_utils.scorefile.harmonised import remap_harmonised from pgscatalog_utils.scorefile.liftover import liftover +from pgscatalog_utils.scorefile.qc import quality_control +from pgscatalog_utils.scorefile.read import load_scorefile from pgscatalog_utils.scorefile.write import write_scorefile @@ -38,14 +38,15 @@ def combine_scorefiles(): use_harmonised = True current_build = h.get('HmPOS_build') else: - logger.error(f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") + logger.error( + f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") raise Exception # Process/QC score and check variant columns score = (score.pipe(remap_harmonised, use_harmonised=True) - .pipe(quality_control, drop_missing=args.drop_missing) - .pipe(melt_effect_weights) - .pipe(set_effect_type)) + .pipe(quality_control, drop_missing=args.drop_missing) + .pipe(melt_effect_weights) + .pipe(set_effect_type)) # Annotate score with the genome_build (in GRCh notation) if current_build is None: @@ -123,4 +124,3 @@ def _parse_args(args=None) -> argparse.Namespace: if __name__ == "__main__": combine_scorefiles() - diff --git a/pgscatalog_utils/scorefile/effect_type.py b/pgscatalog_utils/scorefile/effect_type.py index 78bce7f..50c8c73 100644 --- a/pgscatalog_utils/scorefile/effect_type.py +++ b/pgscatalog_utils/scorefile/effect_type.py @@ -1,6 +1,7 @@ -import pandas as pd import logging +import pandas as pd + logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/scorefile/effect_weight.py b/pgscatalog_utils/scorefile/effect_weight.py index 2693ec6..4b95e0f 100644 --- a/pgscatalog_utils/scorefile/effect_weight.py +++ b/pgscatalog_utils/scorefile/effect_weight.py @@ -1,5 +1,6 @@ -import re import logging +import re + import pandas as pd logger = logging.getLogger(__name__) @@ -46,5 +47,3 @@ def _melt(df: pd.DataFrame) -> pd.DataFrame: """ Melt a multiple effect weight format """ ew_cols: list[str] = df.filter(regex="effect_weight_*").columns.to_list() return df.melt(value_vars=ew_cols, value_name="effect_weight", var_name="accession") - - diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index 5fe4488..06c3141 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -1,4 +1,5 @@ import logging + import pandas as pd from 
pgscatalog_utils.scorefile.read import _read_header @@ -14,17 +15,18 @@ def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']]) return df + def build2GRC(build): """Map build names so they can be compared with GRCh37 and 38""" - build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', 'hg38': 'GRCh38'} # standardise build names + build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', + 'hg38': 'GRCh38'} # standardise build names if pd.isnull(build): return None else: return build_2_GRC_dict.get(build) - -def _read_build(path: str) -> str: +def _read_build(path: str) -> dict[str, str]: """ Open scorefiles and automatically handle compressed input """ logger.debug(f'Reading header of {path}') h = _read_header(path) @@ -38,4 +40,3 @@ def _get_builds(paths: list) -> pd.DataFrame: | x_hmPOS_GRCh37.txt.gz | | x_hmPOS_GRCh37.txt.gz | hg19 | GRCh37 | """ return pd.DataFrame.from_dict({path: _read_build(path) for path in paths}, orient='index') - diff --git a/pgscatalog_utils/scorefile/harmonised.py b/pgscatalog_utils/scorefile/harmonised.py index bc9c329..b56fb93 100644 --- a/pgscatalog_utils/scorefile/harmonised.py +++ b/pgscatalog_utils/scorefile/harmonised.py @@ -1,5 +1,6 @@ -import re import logging +import re + import pandas as pd logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index df9f5e1..8dfcdd6 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -1,8 +1,10 @@ +import logging +import os + import pandas as pd import pyliftover -import os -import logging -from .genome_build import annotate_build + +from pgscatalog_utils.scorefile.genome_build import annotate_build logger = logging.getLogger(__name__) @@ -19,7 +21,8 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st logger.debug("Liftover required for all scorefile variants") else: logger.debug("Skipping liftover for scorefiles with same build as target genome") - no_liftover.loc[:,['lifted_chr', 'lifted_pos']] = no_liftover[['chr_name', 'chr_position']] # assume col structure + no_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = no_liftover[ + ['chr_name', 'chr_position']] # assume col structure no_liftover.assign(liftover=None) if to_liftover.empty: @@ -32,7 +35,7 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] .assign(liftover=True)) - unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)]\ + unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] \ .assign(liftover=False)) _check_min_liftover(mapped, unmapped, min_lift) @@ -45,7 +48,7 @@ def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: n_variants: pd.DataFrame = (pd.DataFrame(df.groupby('accession')['liftover'].count()) .reset_index() .rename({'liftover': 'n_var'}, axis=1)) - lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count())\ + lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count()) \ .rename_axis(['accession', 'liftover_status']) .reset_index()) summary: pd.DataFrame = lo_counts.merge(n_variants, on='accession') @@ -91,7 +94,7 @@ def _parse_lifted_chrom(i: str) -> str: 
def _create_liftover(chain_dir: str) -> dict['str': pyliftover.LiftOver]: """ Create LiftOver objects that can remap genomic coordinates """ builds: list[str] = ["hg19hg38", "hg38hg19"] - chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]] + chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]] lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains] logger.debug("Chain files loaded for liftover") return dict(zip(builds, lo)) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 36b20c0..f88636d 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,6 +1,7 @@ -import pandas as pd import logging +import pandas as pd + logger = logging.getLogger(__name__) @@ -75,9 +76,10 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: u_count = u_count > 1 u_count.name = 'is_duplicated' df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) - df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos + df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos return df + def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)" @@ -88,4 +90,3 @@ def _check_columns(df: pd.DataFrame) -> None: "using PGS Catalog files with rsIDs you should request " \ "harmonised data files (HmPOS) instead." assert 'effect_allele' in df, "ERROR: Missing effect allele column" - diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index 61d6d44..d5c2b39 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -1,13 +1,9 @@ -import os -from typing import Tuple - -import pandas as pd -import logging - import gzip import io +import logging +import os -from pandas import DataFrame +import pandas as pd logger = logging.getLogger(__name__) @@ -59,6 +55,7 @@ def _get_basename(path: str) -> str: """ Return the basename of a scoring file without extension """ return os.path.basename(path).split('.')[0] + remap_header = { 'PGS ID': 'pgs_id', 'PGS Name': 'pgs_name', @@ -70,7 +67,7 @@ def _get_basename(path: str) -> str: 'LICENSE': 'license', # Harmonization related 'HmPOS Build': 'HmPOS_build', - 'HmPOS Date':'HmPOS_date', + 'HmPOS Date': 'HmPOS_date', 'HmVCF Reference': 'HmVCF_ref', 'HmVCF Date': 'HmVCF_date', 'HmVCF N Matched Variants': 'HmVCF_n_matched', diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index f9762b1..3f23830 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -1,6 +1,6 @@ -import pandas as pd import logging -import sqlite3 + +import pandas as pd logger = logging.getLogger(__name__) @@ -34,4 +34,3 @@ def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: return df.query('liftover == True') else: return df - diff --git a/tests/test_combine.py b/tests/test_combine.py index c76bcdc..ae7de87 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,8 +1,8 @@ from unittest.mock import patch +import jq import pandas as pd import pytest -import jq from pgscatalog_utils.download.score import query_score from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles @@ -35,4 +35,3 @@ def 
_n_variants(pgs_accessions): json = query_score(pgs_accessions) n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() return sum(n) - diff --git a/tests/test_download.py b/tests/test_download.py index 611740e..13fdeeb 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,11 +1,12 @@ import os -import pytest from unittest.mock import patch -from pgscatalog_utils.download.trait import query_trait +import pytest + +from pgscatalog_utils.download.download_scorefile import download_scorefile from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url -from pgscatalog_utils.download.download_scorefile import download_scorefile +from pgscatalog_utils.download.trait import query_trait @pytest.fixture(params=[["PGS000001"], ["PGS000001", "PGS000802"]]) @@ -32,6 +33,7 @@ def test_download_scorefile_author(tmp_path): download_scorefile() assert os.listdir(out_dir) == ['PGS000001.txt.gz'] + def test_download_scorefile_hmPOS(tmp_path): out_dir = str(tmp_path.resolve()) args: list[str] = ['download_scorefiles', '-i', 'PGS000001', '-b', 'GRCh38', '-o', out_dir] diff --git a/tests/test_liftover.py b/tests/test_liftover.py index 66ebac5..b2f03a0 100644 --- a/tests/test_liftover.py +++ b/tests/test_liftover.py @@ -1,4 +1,5 @@ import pandas as pd + from pgscatalog_utils.scorefile.liftover import liftover diff --git a/tests/test_match.py b/tests/test_match.py index 6f3394d..717adfb 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -1,5 +1,5 @@ -import os from unittest.mock import patch + import polars as pl import pytest @@ -53,8 +53,8 @@ def test_match_strategies(small_scorefile, small_target): # when keeping ambiguous and flipping alleles: # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False)\ - .filter(pl.col('ambiguous') == True)) + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) \ + .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref'}) @@ -62,7 +62,7 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) @@ -123,4 +123,3 @@ def small_target(): "ALT": ["C", "A", "G"], "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], "is_multiallelic": [False, False, False]}) - From 2bf12201ff582e80723ac7ec5051d14b6b86cb7a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 16:29:33 +0100 Subject: [PATCH 20/59] concat columns instead of setting values directly --- pgscatalog_utils/scorefile/liftover.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 8dfcdd6..45258b1 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -30,7 +30,8 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st else: lo: 
dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files logger.debug("Lifting over scoring files") - to_liftover[['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) + lifted: pd.DataFrame = to_liftover.apply(_convert_coordinates, axis=1, lo_dict=lo) + to_liftover = pd.concat([to_liftover, lifted], axis=1) logger.debug("Liftover complete") mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] @@ -65,6 +66,8 @@ def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) -> pd.Series: """ Convert genomic coordinates to different build """ + converted: list[tuple[str, int, str, int]] | None + if df[['chr_name', 'chr_position']].isnull().values.any(): converted = None else: @@ -72,14 +75,14 @@ def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) chrom: str = 'chr' + str(df['chr_name']) pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed # converted example: [('chr22', 15460378, '+', 3320966530)] or None - converted: list[tuple[str, int, str, int] | None] = lo.convert_coordinate(chrom, pos) + converted = lo.convert_coordinate(chrom, pos) if converted: lifted_chrom: str = _parse_lifted_chrom(converted[0][0][3:]) # return first matching liftover lifted_pos: int = int(converted[0][1]) + 1 # reverse 0 indexing - return pd.Series([lifted_chrom, lifted_pos]) + return pd.Series([lifted_chrom, lifted_pos], index=['lifted_chr', 'lifted_pos']) else: - return pd.Series([None, None]) + return pd.Series([None, None], index=['lifted_chr', 'lifted_pos']) def _parse_lifted_chrom(i: str) -> str: From e2e63f94cccb374e809015ab6c84e8ab4d25cdb8 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 16:32:33 +0100 Subject: [PATCH 21/59] remove unused functions --- pgscatalog_utils/scorefile/genome_build.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index 06c3141..7ea4f09 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -2,8 +2,6 @@ import pandas as pd -from pgscatalog_utils.scorefile.read import _read_header - logger = logging.getLogger(__name__) @@ -24,19 +22,3 @@ def build2GRC(build): return None else: return build_2_GRC_dict.get(build) - - -def _read_build(path: str) -> dict[str, str]: - """ Open scorefiles and automatically handle compressed input """ - logger.debug(f'Reading header of {path}') - h = _read_header(path) - return {k: h.get(k, None) for k in ('genome_build', 'HmPOS_build')} - - -def _get_builds(paths: list) -> pd.DataFrame: - """ Get genome builds for a series of scorefile paths - | filename | -> | | genome_build | HmPOS_build | - | x.txt.gz | | x.txt.gz | hg19 | None | - | x_hmPOS_GRCh37.txt.gz | | x_hmPOS_GRCh37.txt.gz | hg19 | GRCh37 | - """ - return pd.DataFrame.from_dict({path: _read_build(path) for path in paths}, orient='index') From d7168e4652938a128712dae1aa70781bc0d8853e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 1 Sep 2022 09:15:48 +0100 Subject: [PATCH 22/59] Update combine_scorefiles.py --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 
6efeb51..2f4cdd1 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -43,7 +43,7 @@ def combine_scorefiles(): raise Exception # Process/QC score and check variant columns - score = (score.pipe(remap_harmonised, use_harmonised=True) + score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised) .pipe(quality_control, drop_missing=args.drop_missing) .pipe(melt_effect_weights) .pipe(set_effect_type)) From f75d401b7506081e01cc5d7506c2c1f8a02f739c Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 1 Sep 2022 10:25:54 +0100 Subject: [PATCH 23/59] Make genome build a required header item for combine_scorefiles --- pgscatalog_utils/scorefile/combine_scorefiles.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 2f4cdd1..5b30fda 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -51,6 +51,11 @@ def combine_scorefiles(): # Annotate score with the genome_build (in GRCh notation) if current_build is None: current_build = build2GRC(h.get('genome_build')) + if current_build is None: + logger.error("Scorefile has no build information, " + "please add the build to the header with " + "('#genome_build=[insert variant build]") + raise Exception score = score.assign(genome_build=current_build) From a28f4c4b0c821f4d68f0199e75b0bc82525e5fc3 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 1 Sep 2022 14:59:28 +0100 Subject: [PATCH 24/59] bump version --- Dockerfile | 4 ++-- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9e97be8..8c19690 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ FROM python:3.10 WORKDIR /opt/ -COPY --from=builder /app/dist/pgscatalog_utils-0.1.1-py3-none-any.whl . +COPY --from=builder /app/dist/pgscatalog_utils-0.1.2-py3-none-any.whl . 
-RUN pip install pgscatalog_utils-0.1.1-py3-none-any.whl +RUN pip install pgscatalog_utils-0.1.2-py3-none-any.whl RUN apt-get update && apt-get install -y sqlite3 \ No newline at end of file diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index df9144c..10939f0 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.1.2' diff --git a/pyproject.toml b/pyproject.toml index 44ef233..acfcb36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.1.1" +version = "0.1.2" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert "] From b982ce669cdbaf7f8454eaffdc56b8f5d47f2c09 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 5 Sep 2022 16:18:54 +0100 Subject: [PATCH 25/59] fix _prioritise_match_type() with flipped match strategies --- pgscatalog_utils/match/postprocess.py | 50 +++++++++------------------ 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 71da4b6..fae0b0d 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -54,26 +54,21 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFr :param drop_duplicates: If it's impossible to make match candidates unique, drop all candidates? :return: A dataframe containing the best match candidate for each variant """ - - dups: pl.DataFrame = _get_duplicate_variants(df) + logger.debug("First match pruning: prioritise by match types") + prioritised = _prioritise_match_type(df) + singletons: pl.DataFrame = _get_singleton_variants(prioritised) + dups: pl.DataFrame = _get_duplicate_variants(prioritised) if dups: - logger.debug("First match pruning: prioritise by match types") - singletons: pl.DataFrame = _get_singleton_variants(df) - prioritised: pl.DataFrame = _prioritise_match_type(dups) - prioritised_dups: pl.DataFrame = _get_duplicate_variants(prioritised) - if prioritised_dups and not keep_first_match: - logger.debug("Final match pruning: dropping remaining duplicate matches") - distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised)]) - elif prioritised_dups and keep_first_match: + if keep_first_match: logger.debug("Final match pruning: keeping first match") - distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised), - prioritised.unique(maintain_order=True)]) + distinct: pl.DataFrame = pl.concat([singletons, dups.unique(maintain_order=True)]) else: - logger.debug("Final match pruning unnecessary") - distinct: pl.DataFrame = pl.concat([singletons, prioritised]) + logger.debug("Final match pruning: dropping remaining duplicate matches") + distinct: pl.DataFrame = singletons else: - distinct: pl.DataFrame = df + logger.debug("Final match pruning unnecessary") + distinct: pl.DataFrame = singletons assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" logger.debug("Match pruning complete") @@ -98,34 +93,23 @@ def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: - dup_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") != None) - dup_no_oa: pl.DataFrame = 
duplicates.filter(pl.col("other_allele") == None) - best_matches: list[pl.DataFrame] = [] - - if dup_oa: - match_priority: list[str] = ['refalt', 'altref', 'refalt_flip', 'altref_flip'] - logger.debug(f"Prioritising matches in order {match_priority}") - best_matches.append(_get_best_match(dup_oa, match_priority)) - - if dup_no_oa: - match_priority: list[str] = ['no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] - logger.debug(f"Prioritising matches in order {match_priority}") - best_matches.append(_get_best_match(dup_no_oa, match_priority)) - - return pl.concat(best_matches) + # first element has the highest priority and last element has the lowest priority + match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', + 'no_oa_alt_flip'] + return _get_best_match(duplicates, match_priority) def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: match: list[pl.DataFrame] = [] for match_type in match_priority: + logger.debug(f"Selecting matches with match type {match_type}") match.append(df.filter(pl.col("match_type") == match_type)) - logger.debug("Filtering best match types") + logger.debug("Prioritising match types (refalt > altref > ...)") return reduce(lambda x, y: _join_best_match(x, y), match) def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: # variants in dataframe x have a higher priority than dataframe y # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', - on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) + not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID']) return pl.concat([x, not_in]) From 8795a5806e5f5a6218420917611c43678cd85290 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 5 Sep 2022 16:51:25 +0100 Subject: [PATCH 26/59] add other_allele to _get_singleton_variants and _get_duplicate_variants --- pgscatalog_utils/match/postprocess.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index fae0b0d..ba42378 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -78,18 +78,18 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFr def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: """ Return variants with only one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) .count() - .filter(pl.col('count') == 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + .filter(pl.col('count') == 1)[:, "accession":"other_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: """ Return variants with more than one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) .count() - .filter(pl.col('count') > 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + 
.filter(pl.col('count') > 1)[:, "accession":"other_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: From 35a08d9d0f6be9230bad8615d326e8b1fdbf030b Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 5 Sep 2022 17:29:27 +0100 Subject: [PATCH 27/59] Readability/doc-edits --- pgscatalog_utils/match/postprocess.py | 36 +++++++++++++-------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index ba42378..b7e14b0 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -41,23 +41,22 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: .otherwise(False))) -def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFrame: +def _prune_matches(df: pl.DataFrame, keep_first_match: bool = False) -> pl.DataFrame: """ Select the best match candidate in the target for each variant in the scoring file - - In a scoring file (accession), each variant ID with the same effect allele and weight *must be unique* - The variant matching process normally returns multiple match candidates for each variant ID, e.g.: refalt > altref > refalt_flip > altref_flip - When multiple match candidates for an ID exist, they must be prioritised and pruned to be unique - If it's impossible to prioritise match candidates (i.e. same strategy is used), drop all matches by default + - In a scoring file (accession), each variant ID *must be unique* (have only one weight and effect_allele) :param df: A dataframe containing multiple match candidates for each variant - :param drop_duplicates: If it's impossible to make match candidates unique, drop all candidates? + :param keep_first_match: If it's impossible to make match candidates unique, keep the first occuring variant? :return: A dataframe containing the best match candidate for each variant """ logger.debug("First match pruning: prioritise by match types") prioritised = _prioritise_match_type(df) - singletons: pl.DataFrame = _get_singleton_variants(prioritised) - dups: pl.DataFrame = _get_duplicate_variants(prioritised) + singletons, dups = _divide_matches(prioritised) if dups: if keep_first_match: @@ -70,26 +69,25 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFr logger.debug("Final match pruning unnecessary") distinct: pl.DataFrame = singletons - assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" + # Final QC check + u_counts = distinct.groupby(['accession', 'ID']).count() + assert all(u_counts['count'] == 1), "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) + logger.debug("Match pruning complete") return distinct.with_column(pl.lit(True).alias('passes_pruning')) -def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: - """ Return variants with only one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) - .count() - .filter(pl.col('count') == 1)[:, "accession":"other_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) - +def _divide_matches(df: pl.DataFrame) -> tuple [ pl.DataFrame, pl.DataFrame ]: + """ Divide score file match candidates with only one row (unique) vs. 
multiple (duplicates)""" + join_cols = ['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'] + counted = df.groupby(join_cols).count() + singletons = (counted.filter(pl.col('count') == 1)[:, "accession":"other_allele"] + .join(df, on=join_cols, how='left')) + duplicates = (counted.filter(pl.col('count') > 1)[:, "accession":"other_allele"] + .join(df, on=join_cols, how='left')) -def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: - """ Return variants with more than one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) - .count() - .filter(pl.col('count') > 1)[:, "accession":"other_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) + return singletons, duplicates def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: From 356a479661839470802bbf3676bc7fd8f154b604 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 5 Sep 2022 18:21:47 +0100 Subject: [PATCH 28/59] Number lines w/in an accession --- pgscatalog_utils/scorefile/read.py | 5 ++--- pgscatalog_utils/scorefile/write.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index d5c2b39..14cb52d 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -10,10 +10,9 @@ def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: logger.debug(f'Reading scorefile {path}') + df = pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) return (_read_header(path), - pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) - .assign(filename_prefix=_get_basename(path), - filename=path)) + df.assign(filename_prefix=_get_basename(path), filename=path, row_nr=df.index)) def _read_header(path: str) -> dict: diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 3f23830..0dd7b38 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -7,7 +7,7 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession'] + 'is_duplicated', 'accession', 'row_nr'] if df.empty: logger.error("Empty scorefile output! 
Please check the input data") From 2503cb62407b3edde0993d918289676ae959f3c1 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 5 Sep 2022 18:34:19 +0100 Subject: [PATCH 29/59] Use row_nr to priortise matches, and ID to de-duplicate scoring files --- pgscatalog_utils/match/match.py | 2 +- pgscatalog_utils/match/postprocess.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index ac566b2..6da2e9c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -15,7 +15,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) matches: list[pl.DataFrame] = [] - col_order = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', + col_order = ['row_nr', 'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index b7e14b0..7676902 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -79,22 +79,22 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = False) -> pl.DataF def _divide_matches(df: pl.DataFrame) -> tuple [ pl.DataFrame, pl.DataFrame ]: - """ Divide score file match candidates with only one row (unique) vs. multiple (duplicates)""" - join_cols = ['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'] + """ Divide scorefile (accession) matches with only one ID match (singletons) vs. 
multiple (duplicates)""" + join_cols = ['accession', 'ID'] counted = df.groupby(join_cols).count() - singletons = (counted.filter(pl.col('count') == 1)[:, "accession":"other_allele"] + singletons = (counted.filter(pl.col('count') == 1)[:, join_cols] .join(df, on=join_cols, how='left')) - duplicates = (counted.filter(pl.col('count') > 1)[:, "accession":"other_allele"] + duplicates = (counted.filter(pl.col('count') > 1)[:, join_cols] .join(df, on=join_cols, how='left')) return singletons, duplicates -def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: - # first element has the highest priority and last element has the lowest priority +def _prioritise_match_type(all_matches: pl.DataFrame) -> pl.DataFrame: + # Select best match for each row in the scoring file match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] - return _get_best_match(duplicates, match_priority) + return _get_best_match(all_matches, match_priority) def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: @@ -109,5 +109,5 @@ def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: # variants in dataframe x have a higher priority than dataframe y # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID']) + not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'row_nr']) return pl.concat([x, not_in]) From 13814b53d4baa61c259a31c6227d42a8d5a2b8f0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 6 Sep 2022 16:56:23 +0100 Subject: [PATCH 30/59] compress log output --- pgscatalog_utils/match/log.py | 19 +++++++++++++++++++ pgscatalog_utils/match/write.py | 4 ---- 2 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 pgscatalog_utils/match/log.py diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py new file mode 100644 index 0000000..13085cb --- /dev/null +++ b/pgscatalog_utils/match/log.py @@ -0,0 +1,19 @@ +import gzip +import logging + +import polars as pl + +logger = logging.getLogger(__name__) + + +def write_log(df: pl.DataFrame, dataset: str) -> None: + logger.debug("Compressing and writing log") + with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: + df.pipe(_prettify_log).write_csv(f) + + +def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: + keep_cols = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", + "accession", "row_nr", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", + "ambiguous", "duplicate", "best_match", "dataset", "score_pass", "match_rate"] + return df.select(keep_cols).select(pl.exclude("^.*_right")) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 1935bd5..7a8a880 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -21,10 +21,6 @@ def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: [_write_scorefile(ea_dict.get(k), v, split, outdir, dataset) for k, v in deduplicated.items()] -def write_log(df: pl.DataFrame, dataset: str) -> None: - df.write_csv(f"{dataset}_log.csv") - - def _write_scorefile(effect_type: str, scorefiles: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: """ Write a list of scorefiles with the same effect type """ # each list 
element contains a dataframe of variants From 724144d464e1ed4628fd73e0389153cbad36061a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 6 Sep 2022 17:01:10 +0100 Subject: [PATCH 31/59] refactor filtering score and variants to filter module --- pgscatalog_utils/match/filter.py | 91 ++++++++++++++++++++++ pgscatalog_utils/match/match.py | 46 +---------- pgscatalog_utils/match/match_variants.py | 36 ++++----- pgscatalog_utils/match/postprocess.py | 97 +++++++++--------------- tests/test_match.py | 14 ++-- 5 files changed, 153 insertions(+), 131 deletions(-) create mode 100644 pgscatalog_utils/match/filter.py diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py new file mode 100644 index 0000000..caa9360 --- /dev/null +++ b/pgscatalog_utils/match/filter.py @@ -0,0 +1,91 @@ +import logging + +import polars as pl + +from pgscatalog_utils.match.log import write_log + +logger = logging.getLogger(__name__) + + +def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool, + min_overlap: float, dataset: str) -> pl.DataFrame: + """ Remove scores that don't match well """ + scorefile: pl.DataFrame = scorefile.with_columns([ + pl.col('effect_type').cast(pl.Categorical), + pl.col('accession').cast(pl.Categorical)]) # same dtypes for join + + # matches may contain more than one row per variant in the scoring file + # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file + filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) + match_log: pl.DataFrame = _join_matches(filtered_matches, scorefile, dataset) + match_log['best_match'] = match_log['best_match'].fill_null(False) + + fail_rates: pl.DataFrame = _calculate_match_rate(match_log) + + scores: list[pl.DataFrame] = [] + for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): + if rate < (1 - min_overlap): + df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [True], 'match_rate': [1 - rate]}) + logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)") + scores.append(df) + else: + df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [False], 'match_rate': [1 - rate]}) + logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") + scores.append(df) + + (match_log.with_column(pl.col('accession').cast(str)) + .join(pl.concat(scores), on='accession', how='left')).pipe(write_log, dataset) # write log to gzipped CSV + + return (filtered_matches.with_column(pl.col('accession').cast(str)) + .join(pl.concat(scores), on='accession', how='left')) + + +def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Calculating overlap between target genome and scoring file") + return (df.groupby('accession') + .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) + .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate'))) + + +def _filter_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: + logger.debug("Final match candidate filtering") + return (df.filter(pl.col('best_match') == True) + .pipe(_handle_ambiguous, remove_ambiguous) + .pipe(_handle_duplicates, keep_first_match)) + + +def _handle_ambiguous(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: + if remove_ambiguous: + logger.debug("Filtering: Removing ambiguous matches") + return 
df.filter(pl.col("ambiguous") == False) + else: + logger.debug("Filtering: Keeping best possible match from ambiguous matches") + ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ + (pl.col("match_type").str.contains('flip').is_not())) + unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) + return pl.concat([ambiguous, unambiguous]) + + +def _handle_duplicates(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: + singletons = df.filter(pl.col('duplicate') == False) + if keep_first_match: + logger.debug("Filtering: keeping first match") + first = (df.filter(pl.col('duplicate') == True) + .groupby(["accession", "ID"]) + .agg([pl.col("row_nr").first()]) + .join(df, on=['accession', 'row_nr'], how='left')) + return pl.concat([singletons, first.select(singletons.columns)]) + else: + logger.debug("Filtering: dropping any duplicate matches") + return singletons + + +def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: + return (scorefile.join(matches, on=['accession', 'row_nr'], how='left') + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))) + + +def _match_keys() -> list[str]: + return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', + 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 6da2e9c..9387146 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -3,13 +3,11 @@ import polars as pl from pgscatalog_utils.match.postprocess import postprocess_matches -from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, - skip_flip: bool, keep_first_match: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -35,47 +33,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous, keep_first_match) - - -def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> pl.DataFrame: - scorefile: pl.DataFrame = scorefile.with_columns([ - pl.col('effect_type').cast(pl.Categorical), - pl.col('accession').cast(pl.Categorical)]) # same dtypes for join - match_log: pl.DataFrame = _join_matches(matches, scorefile, dataset) - fail_rates: pl.DataFrame = (match_log.groupby('accession') - .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) - .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')) - ) - pass_df: pl.DataFrame = pl.DataFrame() - for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): - if rate < (1 - min_overlap): - df = pl.DataFrame({'accession': [accession], 'match_pass': [True], 'match_rate': [1 - rate]}) - pass_df = pl.concat([pass_df, df]) - logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants 
match)") - else: - df = pl.DataFrame({'accession': [accession], 'match_pass': [False], 'match_rate': [1 - rate]}) - pass_df = pl.concat([pass_df, df]) - logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") - - # TODO: fill nulls in certain columns with false in a nicer way - match_log['passes_pruning'] = match_log['passes_pruning'].fill_null(False) - - # add match statistics to log and matches - write_log((match_log.with_column(pl.col('accession').cast(str)) - .join(pass_df, on='accession', how='left')), dataset) - - return (matches.with_column(pl.col('accession').cast(str)) - .join(pass_df, on='accession', how='left')) - - -def _match_keys(): - return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', - 'accession', 'effect_type', 'effect_weight'] - - -def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str): - return scorefile.join(matches, on=_match_keys(), how='left').with_column(pl.lit(dataset).alias('dataset')) + return pl.concat(matches).pipe(postprocess_matches) def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 5a67dd0..6937a90 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -6,7 +6,8 @@ import polars as pl from pgscatalog_utils.log_config import set_logging_level -from pgscatalog_utils.match.match import get_all_matches, check_match_rate +from pgscatalog_utils.match.match import get_all_matches +from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out @@ -35,29 +36,28 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, - args.skip_flip, args.keep_first_match) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip, args.keep_first_match) + args.skip_flip) case "fast": logger.debug(f"Match mode: {match_mode}") matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip, args.keep_first_match) + args.skip_flip) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches: pl.DataFrame = (check_match_rate(scorefile, matches, args.min_overlap, dataset) - .filter(pl.col('match_pass') == True)) + valid_matches = filter_scores(scorefile, matches, args.remove_ambiguous, args.keep_first_match, args.min_overlap, + dataset) - if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 - logger.error("Error: no target variants match any variants in scoring files") - raise Exception + if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 + logger.error("Error: no target variants match any variants in scoring files") + raise Exception - write_out(valid_matches, args.split, args.outdir, dataset) + write_out(valid_matches, args.split, args.outdir, dataset) def _check_target_chroms(target) -> None: @@ -70,37 +70,37 @@ def 
_check_target_chroms(target) -> None: def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: + skip_filp: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match) + return get_all_matches(scorefile, target, skip_filp) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: + skip_filp: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, - remove_multiallelic=remove_multiallelic) # + remove_multiallelic=remove_multiallelic) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) + matches.append(get_all_matches(scorefile, target, skip_filp)) return pl.concat(matches) def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: + skip_filp: bool) -> pl.DataFrame: matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) + matches.append(get_all_matches(scorefile, target, skip_filp)) return pl.concat(matches) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 7676902..0ae0b2b 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -8,24 +8,17 @@ logger = logging.getLogger(__name__) -def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - """ Clean up match candidates ready for writing out, including: +def postprocess_matches(df: pl.DataFrame) -> pl.DataFrame: + """ Label match candidates with additional metadata. Column definitions: - - Label ambiguous variants - - Prune match candidates to select the best match for each variant in the scoring file - - Optionally remove ambiguous variants + - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) + - best_match: True if row is the best possible match type (refalt > altref > ...) 
+ - duplicate: True if >1 scoring file line matches to the same variant ID + - ambiguous: True if ambiguous """ - df = _label_biallelic_ambiguous(df).pipe(_prune_matches, keep_first_match) - - if remove_ambiguous: - logger.debug("Removing ambiguous matches") - return df.filter(pl.col("ambiguous") == False) - else: - logger.debug("Keeping best possible match from ambiguous matches") - ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ - (pl.col("match_type").str.contains('flip').is_not())) - unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) - return pl.concat([ambiguous, unambiguous]) + return (df.with_column(pl.lit(True).alias('match_candidate')) + .pipe(_label_biallelic_ambiguous) + .pipe(_label_pruned_matches)) def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: @@ -41,73 +34,53 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: .otherwise(False))) -def _prune_matches(df: pl.DataFrame, keep_first_match: bool = False) -> pl.DataFrame: - """ Select the best match candidate in the target for each variant in the scoring file - - - The variant matching process normally returns multiple match candidates for each variant ID, e.g.: - refalt > altref > refalt_flip > altref_flip - - When multiple match candidates for an ID exist, they must be prioritised and pruned to be unique - - If it's impossible to prioritise match candidates (i.e. same strategy is used), drop all matches by default - - In a scoring file (accession), each variant ID *must be unique* (have only one weight and effect_allele) - - :param df: A dataframe containing multiple match candidates for each variant - :param keep_first_match: If it's impossible to make match candidates unique, keep the first occuring variant? - :return: A dataframe containing the best match candidate for each variant - """ - logger.debug("First match pruning: prioritise by match types") - prioritised = _prioritise_match_type(df) - singletons, dups = _divide_matches(prioritised) +def _label_pruned_matches(df: pl.DataFrame) -> pl.DataFrame: + best_matches = (df.pipe(_label_best_match) + .pipe(_label_duplicates)) - if dups: - if keep_first_match: - logger.debug("Final match pruning: keeping first match") - distinct: pl.DataFrame = pl.concat([singletons, dups.unique(maintain_order=True)]) - else: - logger.debug("Final match pruning: dropping remaining duplicate matches") - distinct: pl.DataFrame = singletons - else: - logger.debug("Final match pruning unnecessary") - distinct: pl.DataFrame = singletons + # check that duplicates were correctly labelled + u_counts = best_matches.filter(pl.col('duplicate') == False).groupby(['accession', 'ID']).count() + assert (u_counts['count'] == 1).all(), \ + "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) - # Final QC check - u_counts = distinct.groupby(['accession', 'ID']).count() - assert all(u_counts['count'] == 1), "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) + labelled = (df.join(best_matches, how='left', on=['row_nr', 'accession', 'ID']) + .select(pl.exclude("^.*_right$"))) + assert labelled.shape[0] == df.shape[0] # don't want to lose any rows from the input df - logger.debug("Match pruning complete") + return labelled - return distinct.with_column(pl.lit(True).alias('passes_pruning')) +def _label_duplicates(df: pl.DataFrame) -> pl.DataFrame: + """ Label scorefile (accession) matches with only one ID match (singletons) vs. 
multiple (duplicates)""" + logger.debug('Labelling multiple accession - ID rows as duplicates') -def _divide_matches(df: pl.DataFrame) -> tuple [ pl.DataFrame, pl.DataFrame ]: - """ Divide scorefile (accession) matches with only one ID match (singletons) vs. multiple (duplicates)""" join_cols = ['accession', 'ID'] counted = df.groupby(join_cols).count() singletons = (counted.filter(pl.col('count') == 1)[:, join_cols] - .join(df, on=join_cols, how='left')) + .join(df, on=join_cols, how='left') + .with_column(pl.lit(False).alias('duplicate'))) duplicates = (counted.filter(pl.col('count') > 1)[:, join_cols] - .join(df, on=join_cols, how='left')) + .join(df, on=join_cols, how='left') + .with_column(pl.lit(True).alias('duplicate'))) - return singletons, duplicates + return pl.concat([singletons, duplicates]) -def _prioritise_match_type(all_matches: pl.DataFrame) -> pl.DataFrame: - # Select best match for each row in the scoring file +def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] - return _get_best_match(all_matches, match_priority) - - -def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: match: list[pl.DataFrame] = [] for match_type in match_priority: logger.debug(f"Selecting matches with match type {match_type}") match.append(df.filter(pl.col("match_type") == match_type)) - logger.debug("Prioritising match types (refalt > altref > ...)") - return reduce(lambda x, y: _join_best_match(x, y), match) + + logger.debug("Labelling best match type (refalt > altref > ...)") + best_match: pl.DataFrame = reduce(lambda x, y: _prioritise_best_match(x, y), match) + return best_match.with_column(pl.lit(True).alias('best_match')) -def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: +def _prioritise_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: # variants in dataframe x have a higher priority than dataframe y # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'row_nr']) + not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID', 'row_nr']) return pl.concat([x, not_in]) diff --git a/tests/test_match.py b/tests/test_match.py index 70b9671..0dc9a2a 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -46,14 +46,14 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True) assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles: # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False)\ + flip = (get_all_matches(scorefile, target, skip_flip=False)\ .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) @@ -63,13 +63,13 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) 
- df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # one of the matches is ambiguous - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) + flip = (get_all_matches(scorefile, target, skip_flip=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) @@ -78,14 +78,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True) assert df.is_empty() - flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False, keep_first_match=False) + flip = get_all_matches(scorefile, target, skip_flip=False) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) - flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) + flip_ambig = (get_all_matches(scorefile, target, skip_flip=False) .filter(pl.col('ambiguous') == True)) assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous From 98b34873ecb25dfc6f345a2ca7792719e49afecd Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 6 Sep 2022 17:14:42 +0100 Subject: [PATCH 32/59] fix tests --- tests/test_combine.py | 2 +- tests/test_match.py | 27 +++++++++++---------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/tests/test_combine.py b/tests/test_combine.py index ae7de87..8be71c2 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -11,7 +11,7 @@ def test_combine_scorefiles(combined_scorefile, _n_variants): df = pd.read_table(combined_scorefile) cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession'} + 'is_duplicated', 'accession', 'row_nr'} assert set(df.columns).issubset(cols) assert df.shape[0] == _n_variants diff --git a/tests/test_match.py b/tests/test_match.py index 0dc9a2a..42d0e87 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -46,53 +46,48 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, skip_flip=True) + df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) - # when keeping ambiguous and flipping alleles: - # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' - # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, skip_flip=False)\ - .filter(pl.col('ambiguous') == True)) + # when keeping ambiguous and flipping alleles + flip = (get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) - 
assert set(flip['match_type'].to_list()).issubset({'altref'}) + assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'}) def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, skip_flip=True) + df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) - # one of the matches is ambiguous + # check ambiguous matches flip = (get_all_matches(scorefile, target, skip_flip=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) - assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) + assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'}) def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) df = get_all_matches(scorefile, target, skip_flip=True) - assert df.is_empty() + assert set(df['ambiguous']) == {True} + assert set(df['match_type']) == {'refalt'} - flip = get_all_matches(scorefile, target, skip_flip=False) + flip = get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == False) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) - flip_ambig = (get_all_matches(scorefile, target, skip_flip=False) - .filter(pl.col('ambiguous') == True)) - assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous - @pytest.fixture def small_scorefile(): df = pl.DataFrame({"accession": ["test", "test", "test"], + "row_nr": [1, 2, 3], "chr_name": [1, 2, 3], "chr_position": [1, 2, 3], "effect_allele": ["A", "A", "G"], From 856c7bc65f3a194e93253024ba1c089a226f9e44 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:38:12 +0100 Subject: [PATCH 33/59] stop using joins with labelling, because variants went missing --- pgscatalog_utils/match/label.py | 95 ++++++++++++++++++++++++ pgscatalog_utils/match/match.py | 7 +- pgscatalog_utils/match/match_variants.py | 36 +++++---- pgscatalog_utils/match/postprocess.py | 86 --------------------- 4 files changed, 120 insertions(+), 104 deletions(-) create mode 100644 pgscatalog_utils/match/label.py delete mode 100644 pgscatalog_utils/match/postprocess.py diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py new file mode 100644 index 0000000..4291fb6 --- /dev/null +++ b/pgscatalog_utils/match/label.py @@ -0,0 +1,95 @@ +import logging + +import polars as pl + +from pgscatalog_utils.match.preprocess import complement_valid_alleles + +logger = logging.getLogger(__name__) + + +def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.DataFrame: + """ Label match candidates with additional metadata. Column definitions: + + - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) + - best_match: True if row is the best possible match type (refalt > altref > ...) 
+ - duplicate: True if more than one best match exists for the same accession and ID + - ambiguous: True if ambiguous + """ + return (df.with_column(pl.lit(True).alias('match_candidate')) + .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .pipe(_label_best_match) + .pipe(_label_duplicate_best_match, keep_first_match)) + + +def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: + logger.debug("Labelling ambiguous variants") + ambig = ((df.with_columns([ + pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), + pl.lit(True).alias("ambiguous")]) + .pipe(complement_valid_alleles, ["REF"])) + .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) + .then(pl.col("ambiguous")) + .otherwise(False))) + + if remove_ambiguous: + logger.debug("Labelling ambiguous variants with exclude flag") + return ambig.with_column(pl.when(pl.col('ambiguous') == True) + .then(True) + .otherwise(False) + .alias('exclude')) + else: + return ambig.with_column(pl.lit(False).alias('exclude')) + + +def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Labelling best match type (refalt > altref > ...)") + match_priority = {'refalt': 0, 'altref': 1, 'refalt_flip': 2, 'altref_flip': 3, 'no_oa_ref': 4, 'no_oa_alt': 5, + 'no_oa_ref_flip': 6, 'no_oa_alt_flip': 7} + match_priority_rev = {v: k for k, v in match_priority.items()} + + # use a groupby aggregation to guarantee the number of rows stays the same + # rows were being lost using an anti join + reduce approach + prioritised: pl.DataFrame = (df.with_column(pl.col('match_type') + .apply(lambda x: match_priority[x]) + .alias('match_priority')) + .with_column(pl.col("match_priority") + .min() + .over(["accession", "row_nr"]) + .apply(lambda x: match_priority_rev[x]) + .alias('best_match_type')) + .with_column(pl.when(pl.col('best_match_type') == pl.col('match_type')) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('best_match'))) + assert prioritised.shape[0] == df.shape[0] # I'm watching you, Wazowski. Always watching. Always. 
+ return prioritised.drop(['match_priority', 'best_match_type']) + + +def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match) -> pl.DataFrame: + logger.debug('Labelling duplicated best matches') + duplicates = (df.with_column(pl.col('best_match') + .count() + .over(['accession', 'ID', 'best_match']) + .alias('count')) + .with_column(pl.when(pl.col('count') > 1) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate')) + .drop('count')) + + if keep_first_match: + logger.debug("Keeping first duplicate, labelling others with exclude flag ") + # set first duplicate (with the smallest row_nr) to exclude = false + labelled = duplicates.with_column(pl.when((pl.col("duplicate") == True) & + (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "ID", "duplicate"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate')) + else: + logger.debug("Labelling all duplicates with exclude flag") + labelled = duplicates.with_column(pl.lit(False).alias('exclude_duplicate')) + + # get the horizontal maximum to combine the exclusion columns for each variant + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) + .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 9387146..677f22a 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -2,12 +2,13 @@ import polars as pl -from pgscatalog_utils.match.postprocess import postprocess_matches +from pgscatalog_utils.match.label import label_matches logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool, remove_ambiguous: bool, + keep_first_match: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -33,7 +34,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(postprocess_matches) + return pl.concat(matches).pipe(label_matches, remove_ambiguous, keep_first_match) def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 6937a90..b6962e4 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -6,10 +6,11 @@ import polars as pl from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.read import read_target, read_scorefile -from pgscatalog_utils.match.write import write_out +from pgscatalog_utils.match.write import write_out, write_log logger = logging.getLogger(__name__) @@ -20,9 +21,10 @@ def match_variants(): set_logging_level(args.verbose) logger.debug(f"polars n_threads: {pl.threadpool_size()}") - scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): + 
scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) + n_target_files = len(glob(args.target)) matches: pl.DataFrame @@ -36,27 +38,31 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, args.keep_first_match) case "multi": logger.debug(f"Match mode: {match_mode}") - matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.skip_flip) + matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, args.keep_first_match) case "fast": logger.debug(f"Match mode: {match_mode}") - matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.skip_flip) + matches = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, args.keep_first_match) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches = filter_scores(scorefile, matches, args.remove_ambiguous, args.keep_first_match, args.min_overlap, - dataset) + valid_matches, filter_summary = filter_scores(scorefile, matches, args.remove_ambiguous, + args.keep_first_match, args.min_overlap, dataset) if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") raise Exception + big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) + + write_log(big_log, args.dataset) write_out(valid_matches, args.split, args.outdir, dataset) @@ -70,37 +76,37 @@ def _check_target_chroms(target) -> None: def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool) -> pl.DataFrame: + skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, skip_filp) + return get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool) -> pl.DataFrame: + skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, skip_filp)) + matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) return pl.concat(matches) def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool) -> pl.DataFrame: + skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for chrom in 
scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, skip_filp)) + matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) return pl.concat(matches) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py deleted file mode 100644 index 0ae0b2b..0000000 --- a/pgscatalog_utils/match/postprocess.py +++ /dev/null @@ -1,86 +0,0 @@ -import logging -from functools import reduce - -import polars as pl - -from pgscatalog_utils.match.preprocess import complement_valid_alleles - -logger = logging.getLogger(__name__) - - -def postprocess_matches(df: pl.DataFrame) -> pl.DataFrame: - """ Label match candidates with additional metadata. Column definitions: - - - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) - - best_match: True if row is the best possible match type (refalt > altref > ...) - - duplicate: True if >1 scoring file line matches to the same variant ID - - ambiguous: True if ambiguous - """ - return (df.with_column(pl.lit(True).alias('match_candidate')) - .pipe(_label_biallelic_ambiguous) - .pipe(_label_pruned_matches)) - - -def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: - logger.debug("Labelling ambiguous variants") - df = df.with_columns([ - pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), - pl.lit(True).alias("ambiguous") - ]).pipe(complement_valid_alleles, ["REF"]) - - return (df.with_column( - pl.when(pl.col("REF_FLIP") == pl.col("ALT")) - .then(pl.col("ambiguous")) - .otherwise(False))) - - -def _label_pruned_matches(df: pl.DataFrame) -> pl.DataFrame: - best_matches = (df.pipe(_label_best_match) - .pipe(_label_duplicates)) - - # check that duplicates were correctly labelled - u_counts = best_matches.filter(pl.col('duplicate') == False).groupby(['accession', 'ID']).count() - assert (u_counts['count'] == 1).all(), \ - "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) - - labelled = (df.join(best_matches, how='left', on=['row_nr', 'accession', 'ID']) - .select(pl.exclude("^.*_right$"))) - assert labelled.shape[0] == df.shape[0] # don't want to lose any rows from the input df - - return labelled - - -def _label_duplicates(df: pl.DataFrame) -> pl.DataFrame: - """ Label scorefile (accession) matches with only one ID match (singletons) vs. 
multiple (duplicates)""" - logger.debug('Labelling multiple accession - ID rows as duplicates') - - join_cols = ['accession', 'ID'] - counted = df.groupby(join_cols).count() - singletons = (counted.filter(pl.col('count') == 1)[:, join_cols] - .join(df, on=join_cols, how='left') - .with_column(pl.lit(False).alias('duplicate'))) - duplicates = (counted.filter(pl.col('count') > 1)[:, join_cols] - .join(df, on=join_cols, how='left') - .with_column(pl.lit(True).alias('duplicate'))) - - return pl.concat([singletons, duplicates]) - - -def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: - match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', - 'no_oa_alt_flip'] - match: list[pl.DataFrame] = [] - for match_type in match_priority: - logger.debug(f"Selecting matches with match type {match_type}") - match.append(df.filter(pl.col("match_type") == match_type)) - - logger.debug("Labelling best match type (refalt > altref > ...)") - best_match: pl.DataFrame = reduce(lambda x, y: _prioritise_best_match(x, y), match) - return best_match.with_column(pl.lit(True).alias('best_match')) - - -def _prioritise_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: - # variants in dataframe x have a higher priority than dataframe y - # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID', 'row_nr']) - return pl.concat([x, not_in]) From 2abfbe12f79b6b87be893b6843ed04a0e73203ba Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:38:57 +0100 Subject: [PATCH 34/59] update filter to use new flags --- pgscatalog_utils/match/filter.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index caa9360..5256a57 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -2,14 +2,12 @@ import polars as pl -from pgscatalog_utils.match.log import write_log - logger = logging.getLogger(__name__) def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool, - min_overlap: float, dataset: str) -> pl.DataFrame: - """ Remove scores that don't match well """ + min_overlap: float, dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: + """ Remove scores that don't match well and return a summary report df""" scorefile: pl.DataFrame = scorefile.with_columns([ pl.col('effect_type').cast(pl.Categorical), pl.col('accession').cast(pl.Categorical)]) # same dtypes for join @@ -17,7 +15,7 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguo # matches may contain more than one row per variant in the scoring file # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) - match_log: pl.DataFrame = _join_matches(filtered_matches, scorefile, dataset) + match_log: pl.DataFrame = _join_filtered_matches(filtered_matches, scorefile, dataset) match_log['best_match'] = match_log['best_match'].fill_null(False) fail_rates: pl.DataFrame = _calculate_match_rate(match_log) @@ -27,17 +25,17 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguo if rate < (1 - min_overlap): df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [True], 'match_rate': [1 - rate]}) 
logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)") - scores.append(df) + scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) else: df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [False], 'match_rate': [1 - rate]}) logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") - scores.append(df) + scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) - (match_log.with_column(pl.col('accession').cast(str)) - .join(pl.concat(scores), on='accession', how='left')).pipe(write_log, dataset) # write log to gzipped CSV + score_summary: pl.DataFrame = pl.concat(scores) + filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left') + .filter(pl.col('score_pass') == True)) - return (filtered_matches.with_column(pl.col('accession').cast(str)) - .join(pl.concat(scores), on='accession', how='left')) + return filtered_scores, score_summary def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: @@ -70,18 +68,15 @@ def _handle_duplicates(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame singletons = df.filter(pl.col('duplicate') == False) if keep_first_match: logger.debug("Filtering: keeping first match") - first = (df.filter(pl.col('duplicate') == True) - .groupby(["accession", "ID"]) - .agg([pl.col("row_nr").first()]) - .join(df, on=['accession', 'row_nr'], how='left')) - return pl.concat([singletons, first.select(singletons.columns)]) + first = df.filter((pl.col('duplicate') == True) & (pl.col('exclude') == False)) + return pl.concat([singletons, first]) else: logger.debug("Filtering: dropping any duplicate matches") return singletons -def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: - return (scorefile.join(matches, on=['accession', 'row_nr'], how='left') +def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: + return (scorefile.join(matches, on=['row_nr', 'accession'], how='left') .with_column(pl.lit(dataset).alias('dataset')) .select(pl.exclude("^.*_right$"))) From cae522e54ee1a3b179f942aba2395c7e669a670c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:39:12 +0100 Subject: [PATCH 35/59] move write_log to write module --- pgscatalog_utils/match/log.py | 4 ---- pgscatalog_utils/match/write.py | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 13085cb..467800b 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -6,10 +6,6 @@ logger = logging.getLogger(__name__) -def write_log(df: pl.DataFrame, dataset: str) -> None: - logger.debug("Compressing and writing log") - with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: - df.pipe(_prettify_log).write_csv(f) def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 7a8a880..8243c8f 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -1,3 +1,4 @@ +import gzip import logging import os @@ -6,6 +7,12 @@ logger = logging.getLogger(__name__) +def write_log(df: pl.DataFrame, dataset: str) -> None: + logger.debug("Compressing and writing log") + with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: + df.write_csv(f) + + def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: if not 
os.path.isdir(outdir): os.mkdir(outdir) From a08ca6e3589828a0e0596d93839aad4c2330dcf0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:39:27 +0100 Subject: [PATCH 36/59] make a summary log --- pgscatalog_utils/match/log.py | 41 ++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 467800b..53cdb07 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -1,4 +1,3 @@ -import gzip import logging import polars as pl @@ -6,10 +5,42 @@ logger = logging.getLogger(__name__) +def make_logs(scorefile, match_candidates, filter_summary, dataset): + big_log = (_join_match_candidates(scorefile, match_candidates, dataset) + .pipe(_prettify_log)) + summary_log = make_summary_log(big_log, filter_summary) + + return _prettify_log(big_log), summary_log + + +def make_summary_log(df, filter_summary): + """ Make an aggregated table """ + return (df.filter(pl.col('best_match') != False) + .groupby(['dataset', 'accession', 'best_match', 'ambiguous', 'is_multiallelic', 'duplicate', 'exclude']) + .count() + .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) + + +def _prettify_summary(df: pl.DataFrame): + keep_cols = ["dataset", "accession", "score_pass", "ambiguous", "is_multiallelic", "duplicate", "count"] def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: - keep_cols = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", - "accession", "row_nr", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate", "best_match", "dataset", "score_pass", "match_rate"] - return df.select(keep_cols).select(pl.exclude("^.*_right")) + keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", + "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", + "ambiguous", "duplicate", "best_match", "exclude", "dataset"] + pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) + return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) + + +def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, dataset: str) -> pl.DataFrame: + """ + Join match candidates against the original scoring file + + Uses an outer join because mltiple match candidates may exist with different match types + + Multiple match candidates will exist as extra rows in the joined dataframe + """ + return (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))) From 1a07125af087a48121ed3abca83075438dda1745 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:39:32 +0100 Subject: [PATCH 37/59] cast accession to categorical when first reading scorefile --- pgscatalog_utils/match/read.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index f8f5b3e..d1824a2 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -48,7 +48,8 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) - .pipe(complement_valid_alleles, 
flip_cols=['effect_allele', 'other_allele'])) + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) + .with_column(pl.col('accession').cast(pl.Categorical))) return scorefile From a59b31324d0d594a0d8a284dedeed4fe05adf0bc Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:54:30 +0100 Subject: [PATCH 38/59] update polars --- poetry.lock | 13 +++++++------ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index e920a73..d776774 100644 --- a/poetry.lock +++ b/poetry.lock @@ -49,7 +49,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "coverage" -version = "6.4.3" +version = "6.4.4" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -134,19 +134,20 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.13.62" +version = "0.14.9" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] +pandas = ["pyarrow (>=4.0)", "pandas"] +connectorx = ["connectorx"] +numpy = ["numpy (>=1.16.0)"] fsspec = ["fsspec"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] -connectorx = ["connectorx"] -pandas = ["pyarrow (>=4.0)", "pandas"] +pytz = ["pytz"] pyarrow = ["pyarrow (>=4.0)"] -numpy = ["numpy (>=1.16.0)"] [[package]] name = "py" @@ -288,7 +289,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "31cffdaa5cb10864005af569ed7ab3142071abe6934d06789bac6a00ca2ba1ee" +content-hash = "607d2d543f52a4ecc116c0b912c499a83cd1c740244323c81fdfe89ba27a55eb" [metadata.files] atomicwrites = [] diff --git a/pyproject.toml b/pyproject.toml index acfcb36..b9899ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ pandas = "^1.4.3" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" -polars = "^0.13.59" +polars = "^0.14.9" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From d2c3fdc0bb0cd2d2101e9c0c52911b3b877d26fc Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:54:38 +0100 Subject: [PATCH 39/59] don't set columns directly --- pgscatalog_utils/match/filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index 5256a57..b34850f 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -15,8 +15,8 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguo # matches may contain more than one row per variant in the scoring file # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) - match_log: pl.DataFrame = _join_filtered_matches(filtered_matches, scorefile, dataset) - match_log['best_match'] = match_log['best_match'].fill_null(False) + match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) + .with_columns(pl.col('best_match').fill_null(False))) fail_rates: pl.DataFrame = _calculate_match_rate(match_log) From 6e46eb845f50ded5a93d3c303b07cebfc6237bcc Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:54:58 +0100 Subject: [PATCH 40/59] encode match status (matched / unmatched / excluded) --- pgscatalog_utils/match/label.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git 
a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 4291fb6..485040d 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -15,10 +15,21 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ - return (df.with_column(pl.lit(True).alias('match_candidate')) - .pipe(_label_biallelic_ambiguous, remove_ambiguous) - .pipe(_label_best_match) - .pipe(_label_duplicate_best_match, keep_first_match)) + labelled = (df.with_column(pl.lit(True).alias('match_candidate')) + .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .pipe(_label_best_match) + .pipe(_label_duplicate_best_match, keep_first_match)) + + # encode a new column called match status containing matched, unmatched, and excluded + return (labelled.with_columns([ + # set false best match to excluded + pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 2}[x]).alias('match_priority'), + pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') + ]) + .with_column(pl.max(["match_priority", "excluded_match_priority"])) + .with_column(pl.col("max") + .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded'}[x]) + .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: @@ -26,7 +37,7 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra ambig = ((df.with_columns([ pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), pl.lit(True).alias("ambiguous")]) - .pipe(complement_valid_alleles, ["REF"])) + .pipe(complement_valid_alleles, ["REF"])) .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) .otherwise(False))) From b7db4a2e0061d55e7a857006d58192906dcf8f3a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:55:10 +0100 Subject: [PATCH 41/59] make nice logs --- pgscatalog_utils/match/log.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 53cdb07..dcff82c 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -10,25 +10,30 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): .pipe(_prettify_log)) summary_log = make_summary_log(big_log, filter_summary) - return _prettify_log(big_log), summary_log + return _prettify_log(big_log), _prettify_summary(summary_log) def make_summary_log(df, filter_summary): """ Make an aggregated table """ - return (df.filter(pl.col('best_match') != False) - .groupby(['dataset', 'accession', 'best_match', 'ambiguous', 'is_multiallelic', 'duplicate', 'exclude']) + return (df.groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) .count() - .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) + .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], + reverse=True) def _prettify_summary(df: pl.DataFrame): - keep_cols = ["dataset", "accession", "score_pass", "ambiguous", "is_multiallelic", "duplicate", "count"] + keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate", + "count", 
"percent"] + return (df.with_column((pl.col("count") / pl.sum("count")) + .over(["dataset", "accession"]) + .alias("percent")) + .select(keep_cols)) def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate", "best_match", "exclude", "dataset"] + "ambiguous", "duplicate", "match_status", "dataset"] pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) @@ -42,5 +47,6 @@ def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, datas Multiple match candidates will exist as extra rows in the joined dataframe """ return (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') - .with_column(pl.lit(dataset).alias('dataset')) - .select(pl.exclude("^.*_right$"))) + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) + From 4d0c239466ccd2767b0604ddec45cd2c1e24e843 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 18:14:08 +0100 Subject: [PATCH 42/59] remove not best from match log --- pgscatalog_utils/match/label.py | 4 ++-- pgscatalog_utils/match/log.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 485040d..b562594 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -23,12 +23,12 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da # encode a new column called match status containing matched, unmatched, and excluded return (labelled.with_columns([ # set false best match to excluded - pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 2}[x]).alias('match_priority'), + pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 3}[x]).alias('match_priority'), pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') ]) .with_column(pl.max(["match_priority", "excluded_match_priority"])) .with_column(pl.col("max") - .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded'}[x]) + .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded', 3: 'not_best'}[x]) .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index dcff82c..d8af47a 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -15,7 +15,8 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): def make_summary_log(df, filter_summary): """ Make an aggregated table """ - return (df.groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) + return (df.filter(pl.col('match_status') != 'not_best') + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) .count() .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) From 1100a4d46019c47587e6de5fabdd29c629f4edc8 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:53:25 +0100 Subject: [PATCH 43/59] filter with exclude flag --- pgscatalog_utils/match/filter.py | 45 +++++--------------------------- 1 file changed, 7 insertions(+), 38 
deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index b34850f..c47a449 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -5,16 +5,10 @@ logger = logging.getLogger(__name__) -def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool, - min_overlap: float, dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: - """ Remove scores that don't match well and return a summary report df""" - scorefile: pl.DataFrame = scorefile.with_columns([ - pl.col('effect_type').cast(pl.Categorical), - pl.col('accession').cast(pl.Categorical)]) # same dtypes for join - - # matches may contain more than one row per variant in the scoring file - # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file - filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) +def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, + dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: + """ Check overlap between filtered matches and scorefile, remove scores that don't match well and report stats """ + filtered_matches: pl.DataFrame = _filter_matches(matches) match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) .with_columns(pl.col('best_match').fill_null(False))) @@ -45,34 +39,9 @@ def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate'))) -def _filter_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - logger.debug("Final match candidate filtering") - return (df.filter(pl.col('best_match') == True) - .pipe(_handle_ambiguous, remove_ambiguous) - .pipe(_handle_duplicates, keep_first_match)) - - -def _handle_ambiguous(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: - if remove_ambiguous: - logger.debug("Filtering: Removing ambiguous matches") - return df.filter(pl.col("ambiguous") == False) - else: - logger.debug("Filtering: Keeping best possible match from ambiguous matches") - ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ - (pl.col("match_type").str.contains('flip').is_not())) - unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) - return pl.concat([ambiguous, unambiguous]) - - -def _handle_duplicates(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: - singletons = df.filter(pl.col('duplicate') == False) - if keep_first_match: - logger.debug("Filtering: keeping first match") - first = df.filter((pl.col('duplicate') == True) & (pl.col('exclude') == False)) - return pl.concat([singletons, first]) - else: - logger.debug("Filtering: dropping any duplicate matches") - return singletons +def _filter_matches(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Filtering variants with exclude flag") + return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False)) def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: From 157374d4d9fdfca74ae199f45d9d75588b04f7ca Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:53:46 +0100 Subject: [PATCH 44/59] add new check for best matches with duplicate row numbers --- pgscatalog_utils/match/label.py | 57 +++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 
b562594..1636cb1 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -18,7 +18,8 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da labelled = (df.with_column(pl.lit(True).alias('match_candidate')) .pipe(_label_biallelic_ambiguous, remove_ambiguous) .pipe(_label_best_match) - .pipe(_label_duplicate_best_match, keep_first_match)) + .pipe(_label_duplicate_best_match, keep_first_match) + .pipe(_label_duplicate_row_nr)) # encode a new column called match status containing matched, unmatched, and excluded return (labelled.with_columns([ @@ -76,7 +77,21 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: return prioritised.drop(['match_priority', 'best_match_type']) -def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match) -> pl.DataFrame: +def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: + """ Label best match duplicates made when the scoring file is remapped to a different genome build + + ┌─────────┬────────────────────────┬─────────────┬────────────────┬─────┬────────────┐ + │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ + ╞═════════╪════════════════════════╪═════════════╪════════════════╪═════╪════════════╡ + │ 1194115 ┆ PGS002244_hmPOS_GRCh37 ┆ altref ┆ 3:50924580:C:A ┆ C ┆ true │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 1194132 ┆ PGS002244_hmPOS_GRCh37 ┆ refalt_flip ┆ 3:50924580:C:A ┆ C ┆ true │ + └─────────┴────────────────────────┴─────────────┴────────────────┴─────┴────────────┘ + + refalt > altref > ... prioritisation doesn't fix this problem because row_nr is different (duplicated by remapping) + """ logger.debug('Labelling duplicated best matches') duplicates = (df.with_column(pl.col('best_match') .count() @@ -104,3 +119,41 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match) -> pl.DataFr # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) + + +def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: + """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: + + ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ + │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ + ╞════════╪════════════════════════╪════════════╪════════════════╪═════╪════════════╡ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:A:G ┆ A ┆ true │ + ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ + └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + """ + logger.debug("Labelling duplicated matches with same row_nr") + labelled: pl.DataFrame = (df.with_column(pl.col('best_match') + .count() + .over(['accession', 'row_nr', 'best_match']) + .alias('count')) + .with_column(pl.when(pl.col('count') > 1) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate')) + .drop('count') + .rename({'row_nr': 'score_row_nr'}) + .with_row_count() # add temporary row count to get first variant + 
.with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "score_row_nr"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate')) + .drop('row_nr') + .rename({'score_row_nr': 'row_nr'})) + + # get the horizontal maximum to combine the exclusion columns for each variant + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) + .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) From f583bb74ae6a67ef51d090407af519f192a594ff Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:54:23 +0100 Subject: [PATCH 45/59] add assert to log, comparing against input scoring file --- pgscatalog_utils/match/log.py | 57 +++++++++++++++++------- pgscatalog_utils/match/match_variants.py | 7 +-- pgscatalog_utils/match/write.py | 6 +-- 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index d8af47a..1882db2 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -6,16 +6,30 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): - big_log = (_join_match_candidates(scorefile, match_candidates, dataset) - .pipe(_prettify_log)) - summary_log = make_summary_log(big_log, filter_summary) + # best log -> aggregated into summary_log, one match per scoring file line + # big log -> written to compressed gzip, possibly multiple matches per scoring file line + summary_log, big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, + filter_summary=filter_summary, + dataset=dataset) + + # make sure the aggregated best log matches the scoring file accession line count + log_count = (scorefile.groupby("accession") + .count() + .join(summary_log + .groupby(pl.col("accession")) + .agg(pl.sum("count")), + on='accession')) + + assert (log_count['count'] == log_count['count_right']).all(), "Log doesn't match input scoring file" + logger.debug("Log matches input scoring file") return _prettify_log(big_log), _prettify_summary(summary_log) -def make_summary_log(df, filter_summary): +def make_summary_log(best_matches, filter_summary): """ Make an aggregated table """ - return (df.filter(pl.col('match_status') != 'not_best') + logger.debug("Aggregating best match log into a summary table") + return (best_matches .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) .count() .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], @@ -39,15 +53,24 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) -def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, dataset: str) -> pl.DataFrame: - """ - Join match candidates against the original scoring file - - Uses an outer join because mltiple match candidates may exist with different match types - - Multiple match candidates will exist as extra rows in the joined dataframe - """ - return (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') - .with_column(pl.lit(dataset).alias('dataset')) - .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) - +def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filter_summary: pl.DataFrame, + dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: + """ Join match candidates against the original scoring file """ + logger.debug("Making big logs") + # make 
the summary log using the best matched candidates only + summary_log = (scorefile.join(matches.filter(pl.col('match_status') == 'matched'), + on=['row_nr', 'accession'], + how='outer') # left join would make checking line count later pointless + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$")) + .with_column(pl.col('match_status').fill_null("unmatched")) + .pipe(make_summary_log, filter_summary)) + + # make a raw log with all match candidates included + raw_log = (scorefile.join(matches, + on=['row_nr', 'accession'], + how='outer') + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) + + return summary_log, raw_log diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index b6962e4..336d781 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -53,8 +53,8 @@ def match_variants(): raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches, filter_summary = filter_scores(scorefile, matches, args.remove_ambiguous, - args.keep_first_match, args.min_overlap, dataset) + valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, + min_overlap=args.min_overlap) if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") @@ -62,7 +62,8 @@ def match_variants(): big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) - write_log(big_log, args.dataset) + write_log(big_log, prefix=dataset) + summary_log.write_csv(f"{dataset}_summary.csv") write_out(valid_matches, args.split, args.outdir, dataset) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 8243c8f..32be0cf 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -7,9 +7,9 @@ logger = logging.getLogger(__name__) -def write_log(df: pl.DataFrame, dataset: str) -> None: - logger.debug("Compressing and writing log") - with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: +def write_log(df: pl.DataFrame, prefix: str) -> None: + logger.debug(f"Compressing and writing log: {prefix}_log.csv.gz") + with gzip.open(f"{prefix}_log.csv.gz", 'wb') as f: df.write_csv(f) From 70b16779cbb11f37b2617417428dea47a2759b53 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:54:44 +0100 Subject: [PATCH 46/59] distinct -> unique (deprecated) --- pgscatalog_utils/match/write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 32be0cf..d7a0378 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -92,7 +92,7 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra # 2. use cumcount to number duplicate IDs # 3. 
join cumcount data on original DF, use this data for splitting ea_count: pl.DataFrame = (df.select(["ID", "effect_allele"]) - .distinct() + .unique() .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") From f232999a86aafd3945cb807a5288bce902ce28c4 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:54:57 +0100 Subject: [PATCH 47/59] cast more columns to categorical --- pgscatalog_utils/match/read.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index d1824a2..fd1a4c3 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -49,7 +49,10 @@ def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) - .with_column(pl.col('accession').cast(pl.Categorical))) + .with_columns([ + pl.col('accession').cast(pl.Categorical), + pl.col("effect_type").cast(pl.Categorical)])) + return scorefile From 02d43989243bddd7c1a7af5d3105416e5d220843 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 16:20:15 +0100 Subject: [PATCH 48/59] use best_match for summary log --- pgscatalog_utils/match/label.py | 34 +++++++++++++++++++++------------ pgscatalog_utils/match/log.py | 3 ++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 1636cb1..5fa8f20 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -15,11 +15,12 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ - labelled = (df.with_column(pl.lit(True).alias('match_candidate')) - .pipe(_label_biallelic_ambiguous, remove_ambiguous) + labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match, keep_first_match) - .pipe(_label_duplicate_row_nr)) + .pipe(_label_duplicate_row_nr) + .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .with_column(pl.lit(True).alias('match_candidate'))) # encode a new column called match status containing matched, unmatched, and excluded return (labelled.with_columns([ @@ -45,15 +46,22 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra if remove_ambiguous: logger.debug("Labelling ambiguous variants with exclude flag") - return ambig.with_column(pl.when(pl.col('ambiguous') == True) - .then(True) - .otherwise(False) - .alias('exclude')) + return (ambig.with_column(pl.when(pl.col('ambiguous') == True) + .then(True) + .otherwise(False) + .alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "exclude_ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) else: - return ambig.with_column(pl.lit(False).alias('exclude')) + return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: + """ Best matches have the lowest match priority type. Find the best matches and label them. 
""" logger.debug("Labelling best match type (refalt > altref > ...)") match_priority = {'refalt': 0, 'altref': 1, 'refalt_flip': 2, 'altref_flip': 3, 'no_oa_ref': 4, 'no_oa_alt': 5, 'no_oa_ref_flip': 6, 'no_oa_alt_flip': 7} @@ -118,7 +126,8 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl. # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) - .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) + .drop(["exclude", "exclude_duplicate"]) + .rename({"max": "exclude"})) def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: @@ -150,10 +159,11 @@ def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: .over(["accession", "score_row_nr"]))) .then(True) .otherwise(False) - .alias('exclude_duplicate')) + .alias('exclude_duplicate_row_nr')) .drop('row_nr') .rename({'score_row_nr': 'row_nr'})) # get the horizontal maximum to combine the exclusion columns for each variant - return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) - .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_row_nr"])) + .drop(["exclude", "exclude_duplicate_row_nr"]) + .rename({"max": "exclude"})) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 1882db2..b0a4bf8 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -57,8 +57,9 @@ def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filte dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: """ Join match candidates against the original scoring file """ logger.debug("Making big logs") + # make the summary log using the best matched candidates only - summary_log = (scorefile.join(matches.filter(pl.col('match_status') == 'matched'), + summary_log = (scorefile.join(matches.filter(pl.col('best_match') == True), on=['row_nr', 'accession'], how='outer') # left join would make checking line count later pointless .with_column(pl.lit(dataset).alias('dataset')) From 7ad843fa975553412ee0cfe03cef43412dd8ca0f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 17:04:43 +0100 Subject: [PATCH 49/59] reset best_match flag --- pgscatalog_utils/match/label.py | 43 ++++++++++++++++++++------------- pgscatalog_utils/match/log.py | 11 ++++----- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 5fa8f20..b486a11 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -142,26 +142,35 @@ def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + + Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false """ logger.debug("Labelling duplicated matches with same row_nr") labelled: pl.DataFrame = (df.with_column(pl.col('best_match') - .count() - .over(['accession', 'row_nr', 'best_match']) - .alias('count')) - .with_column(pl.when(pl.col('count') > 1) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias('duplicate')) - .drop('count') - .rename({'row_nr': 'score_row_nr'}) - .with_row_count() # add temporary row count to get first variant - 
.with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "score_row_nr"]))) - .then(True) - .otherwise(False) - .alias('exclude_duplicate_row_nr')) - .drop('row_nr') - .rename({'score_row_nr': 'row_nr'})) + .count() + .over(['accession', 'row_nr', 'best_match']) + .alias('count')) + .with_column(pl.when(pl.col('count') > 1) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate')) + .drop('count') + .rename({'row_nr': 'score_row_nr'}) + .with_row_count() # add temporary row count to get first variant + .with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "score_row_nr"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate_row_nr')) + .with_column(pl.when((pl.col("best_match") == True) & + (pl.col("duplicate") == True) & + (pl.col("row_nr") > pl.min("row_nr")).over( + ["accession", "score_row_nr"])) + .then(False) # reset best match flag for duplicates + .otherwise(pl.col("best_match")) # just keep value from existing column + .alias('best_match_duplicate_row_nr')) + .drop(['row_nr', 'best_match']) + .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_row_nr"])) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index b0a4bf8..9bbad27 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -6,19 +6,18 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): - # best log -> aggregated into summary_log, one match per scoring file line - # big log -> written to compressed gzip, possibly multiple matches per scoring file line + # summary log -> aggregated from best matches (one per scoring file line) + # big log -> unaggregated, written to compressed gzip, possibly multiple matches per scoring file line summary_log, big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, filter_summary=filter_summary, dataset=dataset) # make sure the aggregated best log matches the scoring file accession line count + summary_count = (summary_log.groupby(pl.col('accession')) + .agg(pl.sum('count'))) log_count = (scorefile.groupby("accession") .count() - .join(summary_log - .groupby(pl.col("accession")) - .agg(pl.sum("count")), - on='accession')) + .join(summary_count, on='accession')) assert (log_count['count'] == log_count['count_right']).all(), "Log doesn't match input scoring file" logger.debug("Log matches input scoring file") From 8af2ac1ade81d38d14af6a6c3618ff9cc2752e49 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 9 Sep 2022 15:53:01 +0100 Subject: [PATCH 50/59] make labelling clearer --- pgscatalog_utils/match/label.py | 173 ++++++++++++++++---------------- pgscatalog_utils/match/log.py | 9 +- 2 files changed, 94 insertions(+), 88 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index b486a11..fffd77f 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -17,14 +17,14 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da """ labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) - .pipe(_label_duplicate_best_match, keep_first_match) - .pipe(_label_duplicate_row_nr) + .pipe(_label_duplicate_best_match) + 
.pipe(_label_duplicate_id, keep_first_match) .pipe(_label_biallelic_ambiguous, remove_ambiguous) .with_column(pl.lit(True).alias('match_candidate'))) - # encode a new column called match status containing matched, unmatched, and excluded + # encode a new column called match status containing matched, unmatched, excluded, and not_best return (labelled.with_columns([ - # set false best match to excluded + # set false best match to not_best pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 3}[x]).alias('match_priority'), pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') ]) @@ -34,32 +34,6 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) -def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: - logger.debug("Labelling ambiguous variants") - ambig = ((df.with_columns([ - pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), - pl.lit(True).alias("ambiguous")]) - .pipe(complement_valid_alleles, ["REF"])) - .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) - .then(pl.col("ambiguous")) - .otherwise(False))) - - if remove_ambiguous: - logger.debug("Labelling ambiguous variants with exclude flag") - return (ambig.with_column(pl.when(pl.col('ambiguous') == True) - .then(True) - .otherwise(False) - .alias('exclude_ambiguous')) - .with_column(pl.max(["exclude", "exclude_ambiguous"])) - .drop(["exclude", "exclude_ambiguous"]) - .rename({"max": "exclude"})) - else: - return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) - .with_column(pl.max(["exclude", "ambiguous"])) - .drop(["exclude", "exclude_ambiguous"]) - .rename({"max": "exclude"})) - - def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: """ Best matches have the lowest match priority type. Find the best matches and label them. 
""" logger.debug("Labelling best match type (refalt > altref > ...)") @@ -85,7 +59,56 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: return prioritised.drop(['match_priority', 'best_match_type']) -def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: +def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: + """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: + + ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ + │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ + ╞════════╪════════════════════════╪════════════╪════════════════╪═════╪════════════╡ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:A:G ┆ A ┆ true │ + ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ + └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + + Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false + """ + logger.debug("Labelling duplicated best match: flagging first instance with exclude = false") + labelled: pl.DataFrame = (df.with_column(pl.col('best_match') + .count() + .over(['accession', 'row_nr', 'best_match']) + .alias('count')) + .with_column(pl.when((pl.col('count') > 1) & (pl.col('best_match') == True)) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate_best_match')) + .drop('count') + .rename({'row_nr': 'score_row_nr'}) + .with_row_count() # add temporary row count to get first variant + .with_column(pl.when((pl.col("duplicate_best_match") == True) & + (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "score_row_nr"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate_best_match')) + .with_column(pl.when((pl.col("best_match") == True) & + (pl.col("duplicate_best_match") == True) & + (pl.col("row_nr") > pl.min("row_nr")).over( + ["accession", "score_row_nr"])) + .then(False) # reset best match flag for duplicates + .otherwise(pl.col("best_match")) # just keep value from existing column + .alias('best_match_duplicate_row_nr')) + .drop(['row_nr', 'best_match']) + .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) + + # get the horizontal maximum to combine the exclusion columns for each variant + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_best_match"])) + .drop(["exclude", "exclude_duplicate_best_match"]) + .rename({"max": "exclude"})) + + +def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: """ Label best match duplicates made when the scoring file is remapped to a different genome build ┌─────────┬────────────────────────┬─────────────┬────────────────┬─────┬────────────┐ @@ -100,29 +123,31 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl. refalt > altref > ... 
prioritisation doesn't fix this problem because row_nr is different (duplicated by remapping) """ - logger.debug('Labelling duplicated best matches') - duplicates = (df.with_column(pl.col('best_match') - .count() - .over(['accession', 'ID', 'best_match']) + logger.debug('Labelling multiple scoring file lines (accession/row_nr) that best_match to the same variant') + + # the window in .over() starts with accession + ID + # best_match is added to not count: same row_nr, different match_type (_label_best_match) + # duplicate_best_match is added to not count: same row_nr, same match_type (_label_duplicate_best_match) + duplicates = (df.with_column(pl.count("ID") + .over(['accession', 'ID', 'best_match', 'duplicate_best_match']) .alias('count')) - .with_column(pl.when(pl.col('count') > 1) + .with_column(pl.when((pl.col('count') > 1) & (pl.col('best_match') == True)) .then(pl.lit(True)) .otherwise(pl.lit(False)) - .alias('duplicate')) - .drop('count')) + .alias('duplicate_ID'))) if keep_first_match: logger.debug("Keeping first duplicate, labelling others with exclude flag ") # set first duplicate (with the smallest row_nr) to exclude = false - labelled = duplicates.with_column(pl.when((pl.col("duplicate") == True) & + labelled = duplicates.with_column(pl.when((pl.col("duplicate_ID") == True) & (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "ID", "duplicate"]))) + .over(["accession", "ID", "duplicate_ID"]))) .then(True) .otherwise(False) .alias('exclude_duplicate')) else: logger.debug("Labelling all duplicates with exclude flag") - labelled = duplicates.with_column(pl.lit(False).alias('exclude_duplicate')) + labelled = duplicates.with_column(pl.col('duplicate_ID').alias('exclude_duplicate')) # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) @@ -130,49 +155,29 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl. 
.rename({"max": "exclude"})) -def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: - """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: +def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: + logger.debug("Labelling ambiguous variants") + ambig = ((df.with_columns([ + pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), + pl.lit(True).alias("ambiguous")]) + .pipe(complement_valid_alleles, ["REF"])) + .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) + .then(pl.col("ambiguous")) + .otherwise(False))) - ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ - │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ - ╞════════╪════════════════════════╪════════════╪════════════════╪═════╪════════════╡ - │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:A:G ┆ A ┆ true │ - ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ - └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + if remove_ambiguous: + logger.debug("Labelling ambiguous variants with exclude flag") + return (ambig.with_column(pl.when(pl.col('ambiguous') == True) + .then(True) + .otherwise(False) + .alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "exclude_ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) + else: + return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) - Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false - """ - logger.debug("Labelling duplicated matches with same row_nr") - labelled: pl.DataFrame = (df.with_column(pl.col('best_match') - .count() - .over(['accession', 'row_nr', 'best_match']) - .alias('count')) - .with_column(pl.when(pl.col('count') > 1) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias('duplicate')) - .drop('count') - .rename({'row_nr': 'score_row_nr'}) - .with_row_count() # add temporary row count to get first variant - .with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "score_row_nr"]))) - .then(True) - .otherwise(False) - .alias('exclude_duplicate_row_nr')) - .with_column(pl.when((pl.col("best_match") == True) & - (pl.col("duplicate") == True) & - (pl.col("row_nr") > pl.min("row_nr")).over( - ["accession", "score_row_nr"])) - .then(False) # reset best match flag for duplicates - .otherwise(pl.col("best_match")) # just keep value from existing column - .alias('best_match_duplicate_row_nr')) - .drop(['row_nr', 'best_match']) - .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) - # get the horizontal maximum to combine the exclusion columns for each variant - return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_row_nr"])) - .drop(["exclude", "exclude_duplicate_row_nr"]) - .rename({"max": "exclude"})) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 9bbad27..b58bb55 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -29,15 +29,16 @@ def 
make_summary_log(best_matches, filter_summary): """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches - .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', + 'duplicate_ID']) .count() .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) def _prettify_summary(df: pl.DataFrame): - keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate", - "count", "percent"] + keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", + "duplicate_best_match", "duplicate_ID", "count", "percent"] return (df.with_column((pl.col("count") / pl.sum("count")) .over(["dataset", "accession"]) .alias("percent")) @@ -47,7 +48,7 @@ def _prettify_summary(df: pl.DataFrame): def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate", "match_status", "dataset"] + "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) From c3e98a4b7b9d5aac9b387b81f3a5d0689834bdee Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 11:20:48 +0100 Subject: [PATCH 51/59] don't exclude duplicate best matches, just reset the best_match flag --- pgscatalog_utils/match/label.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index fffd77f..0dbd71f 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -72,9 +72,9 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ - Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false + Label the first row with best_match = true, and duplicate rows with best_match = false """ - logger.debug("Labelling duplicated best match: flagging first instance with exclude = false") + logger.debug("Labelling duplicated best match: keeping first instance as best_match = True") labelled: pl.DataFrame = (df.with_column(pl.col('best_match') .count() .over(['accession', 'row_nr', 'best_match']) @@ -86,12 +86,6 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: .drop('count') .rename({'row_nr': 'score_row_nr'}) .with_row_count() # add temporary row count to get first variant - .with_column(pl.when((pl.col("duplicate_best_match") == True) & - (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "score_row_nr"]))) - .then(True) - .otherwise(False) - .alias('exclude_duplicate_best_match')) .with_column(pl.when((pl.col("best_match") == True) & (pl.col("duplicate_best_match") == True) & (pl.col("row_nr") > pl.min("row_nr")).over( @@ -102,10 +96,7 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: .drop(['row_nr', 'best_match']) .rename({'score_row_nr': 'row_nr', 
'best_match_duplicate_row_nr': 'best_match'})) - # get the horizontal maximum to combine the exclusion columns for each variant - return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_best_match"])) - .drop(["exclude", "exclude_duplicate_best_match"]) - .rename({"max": "exclude"})) + return labelled def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: From a92383a207aca3cb2778fa1cce2a1ba27d1856d5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 13:52:53 +0100 Subject: [PATCH 52/59] fix percent --- pgscatalog_utils/match/log.py | 7 +++---- tests/match/__init__.py | 0 tests/match/test_label.py | 0 tests/{ => match}/test_match.py | 0 4 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 tests/match/__init__.py create mode 100644 tests/match/test_label.py rename tests/{ => match}/test_match.py (100%) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index b58bb55..b214aaa 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -32,14 +32,13 @@ def make_summary_log(best_matches, filter_summary): .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', 'duplicate_ID']) .count() - .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], - reverse=True) + .join(filter_summary, how='left', on='accession')) def _prettify_summary(df: pl.DataFrame): keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate_best_match", "duplicate_ID", "count", "percent"] - return (df.with_column((pl.col("count") / pl.sum("count")) + return (df.with_column((pl.col("count") / pl.sum("count") * 100) .over(["dataset", "accession"]) .alias("percent")) .select(keep_cols)) @@ -50,7 +49,7 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) - return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) + return pretty_df def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filter_summary: pl.DataFrame, diff --git a/tests/match/__init__.py b/tests/match/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/match/test_label.py b/tests/match/test_label.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_match.py b/tests/match/test_match.py similarity index 100% rename from tests/test_match.py rename to tests/match/test_match.py From e530f1547051e233c1d2f51bda1c6237c6a651a0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 13:53:15 +0100 Subject: [PATCH 53/59] add _encode_match_priority function --- pgscatalog_utils/match/label.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 0dbd71f..e3d059d 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -22,8 +22,12 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .pipe(_label_biallelic_ambiguous, remove_ambiguous) .with_column(pl.lit(True).alias('match_candidate'))) - # encode a new column called match status containing matched, unmatched, excluded, and not_best - return (labelled.with_columns([ + return 
_encode_match_priority(labelled) + + +def _encode_match_priority(df: pl.DataFrame) -> pl.DataFrame: + """ Encode a new column called match status containing matched, unmatched, excluded, and not_best """ + return (df.with_columns([ # set false best match to not_best pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 3}[x]).alias('match_priority'), pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') @@ -31,7 +35,8 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .with_column(pl.max(["match_priority", "excluded_match_priority"])) .with_column(pl.col("max") .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded', 3: 'not_best'}[x]) - .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) + .alias('match_status') + .cast(pl.Categorical)).drop(["max", "excluded_match_priority", "match_priority"])) def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: From c5affc668ff008ae31562a1466462a924a92cd2f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 16:30:09 +0100 Subject: [PATCH 54/59] fix labels when keeping ambiguous variants --- pgscatalog_utils/match/label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index e3d059d..0d38ccb 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -172,7 +172,7 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra .rename({"max": "exclude"})) else: return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) - .with_column(pl.max(["exclude", "ambiguous"])) + .with_column(pl.max(["exclude", "exclude_ambiguous"])) .drop(["exclude", "exclude_ambiguous"]) .rename({"max": "exclude"})) From 800bf33f8da84c39ec552488a0fbc93851c98df1 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 16:30:25 +0100 Subject: [PATCH 55/59] sort big log --- pgscatalog_utils/match/log.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index b214aaa..91f3999 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -48,7 +48,9 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] - pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) + pretty_df = (df.select(keep_cols) + .select(pl.exclude("^.*_right")) + .sort(["accession", "row_nr", "chr_name", "chr_position"])) return pretty_df From 1a4a14ef2e2772a080aa353f6a36236927b134e7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 16:30:36 +0100 Subject: [PATCH 56/59] add label tests --- conftest.py | 40 ++++++++++++ tests/match/test_label.py | 125 ++++++++++++++++++++++++++++++++++++++ tests/match/test_match.py | 54 ++++------------ 3 files changed, 177 insertions(+), 42 deletions(-) diff --git a/conftest.py b/conftest.py index e322b96..5027d61 100644 --- a/conftest.py +++ b/conftest.py @@ -3,11 +3,13 @@ from unittest.mock import patch import pandas as pd +import polars as pl import pytest import requests as req from pysqlar import SQLiteArchive from pgscatalog_utils.download.download_scorefile 
import download_scorefile +from pgscatalog_utils.match.preprocess import complement_valid_alleles from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles @@ -141,6 +143,44 @@ def hg19_coords(hg38_coords): return pd.DataFrame(d) +@pytest.fixture +def small_flipped_scorefile(small_scorefile): + # simulate a scorefile on the wrong strand + return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) + .drop(['effect_allele', 'other_allele']) + .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) + .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) + + +@pytest.fixture +def small_target(): + return pl.DataFrame({"#CHROM": [1, 2, 3], + "POS": [1, 2, 3], + "REF": ["A", "T", "T"], + "ALT": ["C", "A", "G"], + "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], + "is_multiallelic": [False, False, False]}) + + +@pytest.fixture +def small_scorefile(): + df = pl.DataFrame({"accession": ["test", "test", "test"], + "row_nr": [1, 2, 3], + "chr_name": [1, 2, 3], + "chr_position": [1, 2, 3], + "effect_allele": ["A", "A", "G"], + "other_allele": ["C", "T", "T"], + "effect_weight": [1, 2, 3], + "effect_type": ["additive", "additive", "additive"]}) + + return complement_valid_alleles(df, ["effect_allele", "other_allele"]) + + +@pytest.fixture +def small_scorefile_no_oa(small_scorefile): + return small_scorefile.with_column(pl.lit(None).alias('other_allele')) + + def _get_timeout(url): try: return req.get(url, timeout=5) diff --git a/tests/match/test_label.py b/tests/match/test_label.py index e69de29..8198335 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -0,0 +1,125 @@ +""" Test that matches have the correct labels, which is important for edge case handling and summary stats """ + +import logging +import pytest +import polars as pl + +from pgscatalog_utils.match.match import get_all_matches +from tests.match.test_match import _cast_cat + +logger = logging.getLogger(__name__) + + +def test_label_best_match(multiple_match_types): + """ Test that multiple match candidates are correctly prioritised """ + # both matches are flagged as candidates + assert multiple_match_types['match_candidate'].to_list() == [True, True] + # but the matches have different match types + assert multiple_match_types['match_type'].to_list() == ["altref", "refalt_flip"] + # only one match candidate can survive! + assert multiple_match_types['best_match'].to_list() == [True, False] + assert multiple_match_types['match_status'].to_list() == ["matched", "not_best"] + # however, exclude is _only_ for omitting a 'best match' from the final results, e.g. 
because of duplication + assert multiple_match_types['exclude'].to_list() == [False, False] + # match candidates are filtered by best_match == True and exclude == False + + +def test_label(small_scorefile, small_target): + """ Test typical labels for match candidates with one match per position """ + scorefile, target = _cast_cat(small_scorefile, small_target) + + # get_all_matches calls label_matches + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + + logger.debug(labelled.select(['ID', 'match_type', 'best_match', 'ambiguous', 'match_status', 'exclude'])) + + assert labelled['best_match'].to_list() == [True, True, True] + assert labelled['ambiguous'].to_list() == [False, True, False] + assert labelled['exclude'].to_list() == [False, True, False] + assert labelled['match_status'].to_list() == ["matched", "excluded", "matched"] + + +def test_ambiguous_label(small_flipped_scorefile, small_target): + """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ + scorefile, target = _cast_cat(small_flipped_scorefile, small_target) + + no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + + assert no_ambiguous['best_match'].to_list() == [True] + assert no_ambiguous['ambiguous'].to_list() == [True] + assert no_ambiguous['exclude'].to_list() == [True] + assert no_ambiguous['match_status'].to_list() == ["excluded"] + + # otherwise, ambiguous variants are kept + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + + assert labelled['best_match'].to_list() == [True] + assert labelled['ambiguous'].to_list() == [True] + assert labelled['exclude'].to_list() == [False] + assert labelled['match_status'].to_list() == ["matched"] + + +def test_duplicate_best_match(duplicated_matches, request): + # these matches come from different lines in the original scoring file + assert duplicated_matches["row_nr"].to_list() == [1, 4] + # but they have the same ID! 
+ assert duplicated_matches["ID"].to_list() == ["1:1:A:C", "1:1:A:C"] + # and they're matched with the same match type + assert duplicated_matches["match_type"].to_list() == ["refalt", "refalt"] + # oh dear, they're both the best match + assert duplicated_matches["best_match"].to_list() == [True, True] + # however, we've flagged them as duplicate IDs + assert duplicated_matches['duplicate_ID'].to_list() == [True, True] + + if request.node.callspec.id == "keep_first_match": + # and correctly label _the first occurring match_ as best match + assert duplicated_matches['exclude'].to_list() == [False, True] + assert duplicated_matches['match_status'].to_list() == ["matched", "excluded"] + elif request.node.callspec.id == "delete_both": + # and correctly labelled all duplicate instances for exclusion (default behaviour) + assert duplicated_matches['exclude'].to_list() == [True, True] + assert duplicated_matches['match_status'].to_list() == ["excluded", "excluded"] + + +def test_duplicate_best_match(duplicate_best_match): + # all best matches come from the same row number in the original scoring file + assert duplicate_best_match['row_nr'].to_list() == [1, 1, 1] + # and the match type is duplicated, so we can't prioritise + assert duplicate_best_match['match_type'].to_list() == ['no_oa_alt', 'no_oa_alt', 'no_oa_ref_flip'] + # find the duplicate best matches (with the same match type) + assert duplicate_best_match['duplicate_best_match'].to_list() == [True, True, False] + # and only keep the first occurring best match. the worse match type is correctly set to not_best too. + assert duplicate_best_match['match_status'].to_list() == ["matched", "not_best", "not_best"] + assert duplicate_best_match['best_match'].to_list() == [True, False, False] + + +@pytest.fixture(params=[True, False], ids=["keep_first_match", "delete_both"]) +def duplicated_matches(small_scorefile, small_target, request): + # pgs catalog scorefiles can contain the same variant remapped to multiple rows + # this happens after liftover to a different genome build + # row_nrs will be different, but other information may be the same + dups = (pl.concat([small_scorefile, small_scorefile]) + .with_column(pl.Series(list(range(1, 7))) + .alias('row_nr')) + .filter(pl.col('chr_name') == 1)) + + scorefile, target = _cast_cat(dups, small_target) + + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param) + + +@pytest.fixture +def multiple_match_types(small_target, small_scorefile): + # skip flip will return two candidate matches for one target position: refalt + refalt_flip + scorefile, target = _cast_cat(small_scorefile, small_target) + return (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('chr_name') == 2)) + + +@pytest.fixture +def duplicate_best_match(small_target, small_scorefile_no_oa): + # this type of target genome can sometimes occur when the REF is different at the same position + odd_target = {'#CHROM': [1, 1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'], 'ID': ['1:1:T:C', '1:1:A:A'], + 'is_multiallelic': [False, False]} + scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target)) + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 42d0e87..2c1c8f4 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -1,3 +1,5 @@ +""" Test that 
match strategies return the expected match results""" + from unittest.mock import patch import polars as pl @@ -5,7 +7,6 @@ from pgscatalog_utils.match.match import get_all_matches, _cast_categorical from pgscatalog_utils.match.match_variants import match_variants -from pgscatalog_utils.match.preprocess import complement_valid_alleles def test_match_fail(combined_scorefile, target_path, tmp_path): @@ -46,12 +47,14 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) + df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == False)) assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles - flip = (get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == True)) + flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'}) @@ -60,13 +63,14 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) + df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == False)) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # check ambiguous matches - flip = (get_all_matches(scorefile, target, skip_flip=False) + flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'}) @@ -75,48 +79,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, skip_flip=True) + df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) assert set(df['ambiguous']) == {True} assert set(df['match_type']) == {'refalt'} - flip = get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == False) + flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == False)) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) -@pytest.fixture -def small_scorefile(): - df = pl.DataFrame({"accession": ["test", "test", "test"], - "row_nr": [1, 2, 3], - "chr_name": [1, 2, 3], - "chr_position": [1, 2, 3], - "effect_allele": ["A", "A", "G"], - "other_allele": ["C", "T", "T"], - "effect_weight": [1, 2, 3], - "effect_type": ["additive", "additive", "additive"]}) - - return complement_valid_alleles(df, ["effect_allele", 
"other_allele"]) - - -@pytest.fixture -def small_scorefile_no_oa(small_scorefile): - return small_scorefile.with_column(pl.lit(None).alias('other_allele')) - - -@pytest.fixture -def small_flipped_scorefile(small_scorefile): - # simulate a scorefile on the wrong strand - return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) - .drop(['effect_allele', 'other_allele']) - .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) - .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) - -@pytest.fixture -def small_target(): - return pl.DataFrame({"#CHROM": [1, 2, 3], - "POS": [1, 2, 3], - "REF": ["A", "T", "T"], - "ALT": ["C", "A", "G"], - "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], - "is_multiallelic": [False, False, False]}) From 1a1529bdd771218b6438bd3ee7c5b106cb812273 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 17:17:07 +0100 Subject: [PATCH 57/59] use session scopes for fixtures in conftest.py --- conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conftest.py b/conftest.py index 5027d61..46631c7 100644 --- a/conftest.py +++ b/conftest.py @@ -143,7 +143,7 @@ def hg19_coords(hg38_coords): return pd.DataFrame(d) -@pytest.fixture +@pytest.fixture(scope='session') def small_flipped_scorefile(small_scorefile): # simulate a scorefile on the wrong strand return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) @@ -152,7 +152,7 @@ def small_flipped_scorefile(small_scorefile): .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) -@pytest.fixture +@pytest.fixture(scope='session') def small_target(): return pl.DataFrame({"#CHROM": [1, 2, 3], "POS": [1, 2, 3], @@ -162,7 +162,7 @@ def small_target(): "is_multiallelic": [False, False, False]}) -@pytest.fixture +@pytest.fixture(scope='session') def small_scorefile(): df = pl.DataFrame({"accession": ["test", "test", "test"], "row_nr": [1, 2, 3], @@ -176,7 +176,7 @@ def small_scorefile(): return complement_valid_alleles(df, ["effect_allele", "other_allele"]) -@pytest.fixture +@pytest.fixture(scope='session') def small_scorefile_no_oa(small_scorefile): return small_scorefile.with_column(pl.lit(None).alias('other_allele')) From f1886d04bf528af28b4b822f33b8858366fe3123 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 13 Sep 2022 11:28:04 +0100 Subject: [PATCH 58/59] fix removing multiallelic variants with new polars version --- pgscatalog_utils/match/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index c6dbe47..1723f6d 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -42,7 +42,7 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " "variant representations only") logger.debug('Dropping multiallelic variants') - return df[~df['is_multiallelic']] + return df.filter(~df['is_multiallelic']) else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants From ce8b4beff57bb83aa57fc5d63a0977a46c422951 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 13 Sep 2022 15:07:53 +0100 Subject: [PATCH 59/59] Fix incorrect deduplication (wasn't using matched_effect_allele): this caused splitting into 3 files when a flipped 
variant was also matched --- pgscatalog_utils/match/write.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index d7a0378..53eb15f 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -76,7 +76,7 @@ def _split_effect_type(df: pl.DataFrame) -> dict[str, pl.DataFrame]: def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFrame]: """ Find variant matches that have duplicate identifiers When merging a lot of scoring files, sometimes a variant might be duplicated - this can happen when the effect allele differs at the same position, e.g.: + this can happen when the matched effect allele differs at the same position, e.g.: - chr1: chr2:20003:A:C A 0.3 NA - chr1: chr2:20003:A:C C NA 0.7 where the last two columns represent different scores. plink demands @@ -85,20 +85,20 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra df: A dataframe containing all matches, with columns ID, effect_allele, and effect_weight Returns: - A list of dataframes, with unique ID - effect allele combinations + A list of dataframes, with unique ID - matched effect allele combinations """ # 1. unique ID - EA is important because normal duplicates are already # handled by pivoting, and it's pointless to split them unnecessarily # 2. use cumcount to number duplicate IDs # 3. join cumcount data on original DF, use this data for splitting - ea_count: pl.DataFrame = (df.select(["ID", "effect_allele"]) + ea_count: pl.DataFrame = (df.select(["ID", "matched_effect_allele"]) .unique() .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") ])) - dup_label: pl.DataFrame = df.join(ea_count, on=["ID", "effect_allele"], how="left") + dup_label: pl.DataFrame = df.join(ea_count, on=["ID", "matched_effect_allele"], how="left") # now split the matched variants, and make sure we don't lose any n_splits: int = ea_count.select("cumcount").max()[0, 0] + 1 # cumcount = ngroup-1
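
A minimal, self-contained sketch of the splitting logic the final patch corrects, included here for illustration only (it is not part of the patch series itself). It assumes the same polars API the patches use (with_columns, cumcount, over); the toy data, the ID strings, and the `splits` list are made up, but the cumcount-over-ID idea mirrors _deduplicate_variants: after reducing to unique (ID, matched_effect_allele) pairs, repeated IDs are numbered 0, 1, ... and each number becomes its own output, so every written scorefile keeps unique variant IDs (the constraint the docstring above describes for plink).

    import polars as pl

    # toy matches: one ID matched with two different effect alleles (values are made up)
    matches = pl.DataFrame({
        "ID": ["2:20003:A:C", "2:20003:A:C", "3:50924580:C:A"],
        "matched_effect_allele": ["A", "C", "C"],
        "effect_weight": [0.3, 0.7, 0.1],
    })

    # number duplicate IDs across unique (ID, matched_effect_allele) pairs
    ea_count = (matches.select(["ID", "matched_effect_allele"])
                .unique()
                .with_columns([
                    pl.col("ID").cumcount().over(["ID"]).alias("cumcount"),
                    pl.col("ID").count().over(["ID"]).alias("count"),
                ]))

    dup_label = matches.join(ea_count, on=["ID", "matched_effect_allele"], how="left")
    n_splits = dup_label.select("cumcount").max()[0, 0] + 1  # cumcount = ngroup - 1

    # each split now contains unique variant IDs
    splits = [dup_label.filter(pl.col("cumcount") == i).drop(["cumcount", "count"])
              for i in range(n_splits)]

With the toy data above this yields two splits: the duplicated ID lands in separate outputs, while the singleton stays in the first.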