From e038c8d2ced33ebcbc89a90d29857fc60b1453d9 Mon Sep 17 00:00:00 2001
From: smlmbrt
Date: Thu, 25 Aug 2022 12:27:37 +0100
Subject: [PATCH 01/59] Revised behaviour of combine_scorefiles to not crash when it encounters a duplicated variant position.

---
 .../scorefile/combine_scorefiles.py | 18 ++++++++++--------
 pgscatalog_utils/scorefile/qc.py    |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
index 35d9b85..ef20102 100644
--- a/pgscatalog_utils/scorefile/combine_scorefiles.py
+++ b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -43,19 +43,21 @@ def _read_and_melt(path, drop_missing: bool = False):
 
 def _description_text() -> str:
     return textwrap.dedent('''\
-    Combine multiple scoring files in PGS Catalog format (see
-    https://www.pgscatalog.org/downloads/ for details) to a 'long'
-    table, and optionally liftover genomic coordinates to GRCh37 or
-    GRCh38. Custom scorefiles in PGS Catalog format can be combined
-    with PGS Catalog scoring files. The program can accept a mix of
-    unharmonised and harmonised PGS Catalog data.
+    Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/
+    for details) to a 'long' table of columns needed for variant matching and subsequent calculation.
+
+    Custom scorefiles in PGS Catalog format can be combined with PGS Catalog scoring files, and
+    optionally liftover genomic coordinates to GRCh37 or GRCh38. The script can accept a mix of
+    unharmonised and harmonised PGS Catalog data. By default all variants are output (including
+    positions with duplicated data [often caused by rsID/liftover collisions across builds]) and
+    variants with missing positions.
     ''')
 
 
 def _epilog_text() -> str:
     return textwrap.dedent('''\
-    The long table is used to simplify intersecting variants in target
-    genomes and the scoring files with the match_variants program.
+    The long table is used to simplify intersecting variants in target genotyping datasets
+    and the scoring files with the match_variants program.
''') diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 4316f1e..9b8c98e 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -71,7 +71,7 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: if unique.all(): return df else: - raise Exception("Duplicate variants in scoring file") + logger.warning("Duplicate variants in scoring file.") def _check_shape(df: pd.DataFrame) -> None: From 73154ec48b4debc9a2c662859f586533cd0b983e Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 25 Aug 2022 12:50:44 +0100 Subject: [PATCH 02/59] Needs to return df --- pgscatalog_utils/scorefile/qc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 9b8c98e..ef3cc8a 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -72,6 +72,7 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: return df else: logger.warning("Duplicate variants in scoring file.") + return df def _check_shape(df: pd.DataFrame) -> None: From a4dabb3575b14c74c472e77baf6522ad348c5ca2 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:14:55 +0100 Subject: [PATCH 03/59] Flag duplicated variants within scorefiles (output is_duplicated as bool) --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- pgscatalog_utils/scorefile/qc.py | 11 ++++++----- pgscatalog_utils/scorefile/write.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index ef20102..810ff9b 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -77,7 +77,7 @@ def _parse_args(args=None) -> argparse.Namespace: required="--liftover" in sys.argv, default=0.95, type=float) parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', help='Drop variants with missing information (chr/pos) and ' - 'non-standard alleles from the output file.') + 'non-standard alleles (e.g. 
HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, default='combined.txt', help=' Output path to combined long scorefile') diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index ef3cc8a..bd38991 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -66,14 +66,15 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: logger.warning("Other allele column not detected, dropping other_allele from variant identifier.") group_cols = ['chr_name', 'chr_position', 'effect_allele'] - unique: pd.Series = df.groupby(group_cols).size() == 1 + u_count: pd.Series = df.groupby(group_cols).size() - if unique.all(): - return df + if all(u_count == 1): + return df.assign(is_duplicated=False) else: logger.warning("Duplicate variants in scoring file.") - return df - + u_count = u_count > 1 + u_count.name = 'is_duplicated' + return pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 9204096..1f22197 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -7,7 +7,7 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'accession'] + 'is_duplicated', 'accession'] if df.empty: logger.error("Empty scorefile output! Please check the input data") From 7051f9dca098d596993a8886ef4a0998e6f7b848 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:21:11 +0100 Subject: [PATCH 04/59] Handle null variants, fix test --- pgscatalog_utils/scorefile/qc.py | 4 +++- tests/test_combine.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index bd38991..0e96368 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -74,7 +74,9 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: logger.warning("Duplicate variants in scoring file.") u_count = u_count > 1 u_count.name = 'is_duplicated' - return pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) + df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) + df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos + return df def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" diff --git a/tests/test_combine.py b/tests/test_combine.py index 6243cef..f9ee7a1 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -7,7 +7,8 @@ def test_combine_scorefiles(combined_scorefile, _n_variants): df = pd.read_table(combined_scorefile) - cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession'} + cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', + 'is_duplicated', 'accession'} assert set(df.columns).issubset(cols) assert df.shape[0] == _n_variants From 912d1bd0e1ead87561e9fbef1d17be673e4bf4d1 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:38:23 +0100 Subject: [PATCH 05/59] Implement gzipped output if filename endswith '.gz' --- 
pgscatalog_utils/scorefile/combine_scorefiles.py | 1 + pgscatalog_utils/scorefile/write.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 810ff9b..39bad2b 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -79,6 +79,7 @@ def _parse_args(args=None) -> argparse.Namespace: help='Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, + help='[ will compress output if filename ends with .gz ]', default='combined.txt', help=' Output path to combined long scorefile') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 1f22197..f9762b1 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -13,7 +13,6 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: logger.error("Empty scorefile output! Please check the input data") raise Exception else: - logger.debug("Writing out combined scorefile") out_df: pd.DataFrame = (df.drop('accession', axis=1) .rename({'filename_prefix': 'accession'}, axis=1) .pipe(_filter_failed_liftover)) @@ -21,8 +20,12 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: if 'other_allele' not in out_df: logger.warning("No other allele information detected, writing out as missing data") out_df['other_allele'] = None - - out_df[cols].to_csv(path, index=False, sep="\t") + if path.endswith('.gz'): + logger.debug("Writing out gzip-compressed combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip') + else: + logger.debug("Writing out combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t") def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: From ec5577f91b62547f44b9194ea549e6ab51a8e1fe Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:41:33 +0100 Subject: [PATCH 06/59] Typo --- pgscatalog_utils/scorefile/combine_scorefiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 39bad2b..925ee27 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -79,9 +79,9 @@ def _parse_args(args=None) -> argparse.Namespace: help='Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. 
HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, - help='[ will compress output if filename ends with .gz ]', default='combined.txt', - help=' Output path to combined long scorefile') + help=' Output path to combined long scorefile ' + '[ will compress output if filename ends with .gz ]') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) From eee720ef3bba5031b1127469d45ad4c5f5e90fae Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 12:53:12 +0100 Subject: [PATCH 07/59] Scoring-file specific warning --- pgscatalog_utils/scorefile/qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 0e96368..ff5c942 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -71,7 +71,7 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: if all(u_count == 1): return df.assign(is_duplicated=False) else: - logger.warning("Duplicate variants in scoring file.") + logger.warning("Duplicate variants in scoring file: {}".format(df['filename_prefix'].unique())) u_count = u_count > 1 u_count.name = 'is_duplicated' df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) From 9cb49c55ae7342f821127023fa650b7efb739243 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 16:31:15 +0100 Subject: [PATCH 08/59] More explict handling of genome_build so that files from different builds can't be combined. --- .../scorefile/combine_scorefiles.py | 53 ++++++++++++---- pgscatalog_utils/scorefile/genome_build.py | 60 +++++++------------ pgscatalog_utils/scorefile/liftover.py | 6 +- pgscatalog_utils/scorefile/qc.py | 4 +- pgscatalog_utils/scorefile/read.py | 57 +++++++++++++++--- 5 files changed, 122 insertions(+), 58 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 925ee27..39637db 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -7,12 +7,14 @@ from pgscatalog_utils.log_config import set_logging_level from pgscatalog_utils.scorefile.read import load_scorefile +from pgscatalog_utils.scorefile.harmonised import remap_harmonised +from pgscatalog_utils.scorefile.qc import quality_control +from pgscatalog_utils.scorefile.genome_build import build2GRC from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights from pgscatalog_utils.scorefile.liftover import liftover from pgscatalog_utils.scorefile.write import write_scorefile - def combine_scorefiles(): args = _parse_args() @@ -21,7 +23,43 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") - scorefiles: pd.DataFrame = pd.concat([_read_and_melt(x, drop_missing=args.drop_missing) for x in paths]) + + scorefiles = [] + for x in paths: + # Read scorefile df and header + h, score = load_scorefile(x) + + # Check if we should use the harmonized positions + use_harmonised = False + current_build = None + if h.get('HmPOS_build') is not None: + if h.get('HmPOS_build') == args.target_build: + use_harmonised = True + current_build = h.get('HmPOS_build') + else: + logger.error(f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build 
{args.target_build}") + raise Exception + + + + score = (score.pipe(remap_harmonised, use_harmonised=True) + .pipe(quality_control, drop_missing=args.drop_missing) + .pipe(melt_effect_weights) + .pipe(set_effect_type).assign(genome_build=current_build)) + # Check if the score is in the right build or could be lifted + if current_build is None: + current_build = build2GRC(h.get('genome_build')) + + if (current_build != args.target_build) and (args.liftover is False): + logger.error( + f"Cannot combine {x} (build={h.get('genome_build')}) with target build {args.target_build} without liftover") + logger.error("Try running with --liftover and specifying the --chain_dir") + raise Exception + + scorefiles.append(score) + + + scorefiles: pd.DataFrame = pd.concat(scorefiles) if args.liftover: logger.debug("Annotating scorefiles with liftover parameters") @@ -30,12 +68,6 @@ def combine_scorefiles(): write_scorefile(scorefiles, args.outfile) -def _read_and_melt(path, drop_missing: bool = False): - """ Load a scorefile, melt it, and set the effect types""" - return (load_scorefile(path, drop_missing=drop_missing) - .pipe(melt_effect_weights) - .pipe(set_effect_type)) - if __name__ == "__main__": combine_scorefiles() @@ -68,8 +100,9 @@ def _parse_args(args=None) -> argparse.Namespace: help=' Scorefile path (wildcard * is OK)', required=True) parser.add_argument('--liftover', dest='liftover', help=' Convert scoring file variants to target genome build?', action='store_true') - parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome ', - required='--liftover' in sys.argv) + parser.add_argument('-t', '--target_build', dest='target_build', + choices=['GRCh37', 'GRCh38'], help='Build of target genome', + required=True) parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', required="--liftover" in sys.argv) parser.add_argument('-m', '--min_lift', dest='min_lift', diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index 9ded7ed..d145a2f 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -1,55 +1,41 @@ -import gzip -import io import logging -import re -from typing import TextIO import pandas as pd +from pgscatalog_utils.scorefile.read import _read_header + logger = logging.getLogger(__name__) def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: - """ Annotate the dataframe with genome build data """ + """ Annotate the dataframe with genome build data """ logger.debug(f"Annotating target build: {target_build}") build_dict: dict = {'GRCh37': 'hg19', 'GRCh38': 'hg38', 'hg19': 'hg19', 'hg38': 'hg38'} # standardise build names - df['target_build'] = build_dict[target_build] - - builds: pd.DataFrame = _get_builds(df['filename'].drop_duplicates()) - builds['genome_build'] = builds.apply(lambda x: build_dict[x.genome_build], axis=1) - return df.merge(builds, how="left", on="filename") - - -def _read_header(f: TextIO) -> str: - """ Extract genome build of scorefile from PGS Catalog header format """ - for line in f: - if re.search("^#genome_build", line): - # get #genome_build=GRCh37 from header - header = line.replace('\n', '').replace('#', '').split('=') - # and remap to liftover style - try: - build: str = header[-1] - logger.debug(f"Valid genome build detected: {build}") - return build - except KeyError: - raise Exception("Bad genome build detected in header") - elif line[0] != '#': - raise 
Exception("No genome build detected in header") + df['chain_target_build'] = build_dict[target_build] + df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']]) + return df + +def build2GRC(build): + """Map build names so they can be compared with GRCh37 and 38""" + build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', 'hg38': 'GRCh38'} # standardise build names + if build is None: + return None + else: + return build_2_GRC_dict.get(build) + def _read_build(path: str) -> str: """ Open scorefiles and automatically handle compressed input """ logger.debug(f'Reading header of {path}') - try: - with io.TextIOWrapper(gzip.open(path, 'r')) as f: - return _read_header(f) - except gzip.BadGzipFile: - with open(path, 'r') as f: - return _read_header(f) + h = _read_header(path) + return {k: h.get(k, None) for k in ('genome_build', 'HmPOS_build')} -def _get_builds(s: pd.Series) -> pd.DataFrame: +def _get_builds(paths: list) -> pd.DataFrame: """ Get genome builds for a series of scorefile paths - | filename | -> | filename | genome_build | - | x.txt.gz | | x.txt.gz | hg19 | + | filename | -> | | genome_build | HmPOS_build | + | x.txt.gz | | x.txt.gz | hg19 | None | + | x_hmPOS_GRCh37.txt.gz | | x_hmPOS_GRCh37.txt.gz | hg19 | GRCh37 | """ - return pd.concat([s, s.apply(_read_build).rename("genome_build")], axis=1) + return pd.DataFrame.from_dict({path: _read_build(path) for path in paths}, orient='index') + diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 0d3008c..2680a09 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -12,8 +12,8 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st df = annotate_build(df, target_build) # grab build from scoring file headers mapped, unmapped = pd.DataFrame(), pd.DataFrame() - no_liftover: pd.DataFrame = df.query('target_build == genome_build') - to_liftover: pd.DataFrame = df.query('target_build != genome_build') + no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build') + to_liftover: pd.DataFrame = df.query('chain_target_build != chain_genome_build') if no_liftover.empty: logger.debug("Liftover required for all scorefile variants") @@ -65,7 +65,7 @@ def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) if df[['chr_name', 'chr_position']].isnull().values.any(): converted = None else: - lo = lo_dict[df['genome_build'] + df['target_build']] # extract lo object from dict + lo = lo_dict[df['chain_genome_build'] + df['chain_target_build']] # extract lo object from dict chrom: str = 'chr' + str(df['chr_name']) pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed # converted example: [('chr22', 15460378, '+', 3320966530)] or None diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index ff5c942..36b20c0 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -84,6 +84,8 @@ def _check_shape(df: pd.DataFrame) -> None: def _check_columns(df: pd.DataFrame) -> None: - assert {'chr_name', 'chr_position'}.issubset(df.columns), "If you're using rsids did you request harmonised data?" + assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromsomal positions. If you're " \ + "using PGS Catalog files with rsIDs you should request " \ + "harmonised data files (HmPOS) instead." 
assert 'effect_allele' in df, "ERROR: Missing effect allele column" diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index 7674c7c..a3e6997 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -1,20 +1,46 @@ import os import pandas as pd import logging -from .harmonised import remap_harmonised -from .qc import quality_control + +import gzip +import io logger = logging.getLogger(__name__) -def load_scorefile(path: str, use_harmonised: bool = True, drop_missing: bool = False) -> pd.DataFrame: +def load_scorefile(path: str) -> pd.DataFrame: logger.debug(f'Reading scorefile {path}') - return (pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) - .pipe(remap_harmonised, use_harmonised=use_harmonised) + return (_read_header(path), + pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) .assign(filename_prefix=_get_basename(path), - filename=path) - .pipe(quality_control, drop_missing=drop_missing)) + filename=path)) + + +def _read_header(path: str) -> dict: + """Parses the header of a PGS Catalog format scorefle into a dictionary""" + try: + f = io.TextIOWrapper(gzip.open(path, 'r')) + except gzip.BadGzipFile: + f = open(path, 'r') + + header = {} + lastline = '#' + while lastline.startswith('#'): + lastline = f.readline() + line = lastline.strip() + if line.startswith('#'): + if '=' in line: + line = line[1:].split('=') + field, val = [x.strip() for x in line] + if field in remap_header: + header[remap_header[field]] = val + else: + header[field] = val + if ('genome_build' in header) and (header['genome_build'] == 'NR'): + header['genome_build'] = None + f.close() + return header def _scorefile_dtypes() -> dict[str]: """ Data types for columns that might be found in a scorefile """ @@ -27,3 +53,20 @@ def _get_basename(path: str) -> str: """ Return the basename of a scoring file without extension """ return os.path.basename(path).split('.')[0] +remap_header = { + 'PGS ID': 'pgs_id', + 'PGS Name': 'pgs_name', + 'Reported Trait': 'trait_reported', + 'Original Genome Build': 'genome_build', + 'Number of Variants': 'variants_number', + 'PGP ID': 'pgp_id', + 'Citation': 'citation', + 'LICENSE': 'license', + # Harmonization related + 'HmPOS Build': 'HmPOS_build', + 'HmPOS Date':'HmPOS_date', + 'HmVCF Reference': 'HmVCF_ref', + 'HmVCF Date': 'HmVCF_date', + 'HmVCF N Matched Variants': 'HmVCF_n_matched', + 'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped' +} # Used to maintain reverse compatibility to old scoring files \ No newline at end of file From 4813c2bdf781c35089dfcd3be10b2df9a95b8bb8 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 26 Aug 2022 16:37:26 +0100 Subject: [PATCH 09/59] Set genome_build in the correct place --- pgscatalog_utils/scorefile/combine_scorefiles.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 39637db..9ab594c 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -40,15 +40,16 @@ def combine_scorefiles(): logger.error(f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") raise Exception - - + # Process/QC score and check variant columns score = (score.pipe(remap_harmonised, use_harmonised=True) .pipe(quality_control, drop_missing=args.drop_missing) .pipe(melt_effect_weights) 
- .pipe(set_effect_type).assign(genome_build=current_build)) + .pipe(set_effect_type)) + # Check if the score is in the right build or could be lifted if current_build is None: current_build = build2GRC(h.get('genome_build')) + score = score.assign(genome_build=current_build) if (current_build != args.target_build) and (args.liftover is False): logger.error( From 9c33eee0b98e494513da0ebfcec6fbb61ec2a099 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 26 Aug 2022 17:09:57 +0100 Subject: [PATCH 10/59] prune duplicate variants after match prioritisation --- pgscatalog_utils/match/match.py | 2 -- pgscatalog_utils/match/postprocess.py | 40 ++++++++++++++++++++------- pgscatalog_utils/match/preprocess.py | 8 ------ pgscatalog_utils/match/read.py | 9 +++--- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 6a3f70c..9d92719 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -65,8 +65,6 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap return (matches.with_column(pl.col('accession').cast(str)) .join(pass_df, on='accession', how='left')) - - def _match_keys(): return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 33a0220..e002d27 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -30,28 +30,48 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: return (df.with_column( pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) - .otherwise(False))).pipe(_get_distinct_weights) + .otherwise(False))).pipe(_prune_matches) -def _get_distinct_weights(df: pl.DataFrame) -> pl.DataFrame: +def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: """ Select single matched variant in target for each variant in the scoring file (e.g. 
per accession) """ - count: pl.DataFrame = df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() - singletons: pl.DataFrame = (count.filter(pl.col('count') == 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) - - dups: pl.DataFrame = (count.filter(pl.col('count') > 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + dups: pl.DataFrame = _get_duplicate_variants(df) if dups: - distinct: pl.DataFrame = pl.concat([singletons, _prioritise_match_type(dups)]) + logger.debug("First match pruning: prioritise by match types") + singletons: pl.DataFrame = _get_singleton_variants(df) + prioritised: pl.DataFrame = _prioritise_match_type(dups) + prioritised_dups: pl.DataFrame = _get_duplicate_variants(prioritised) + if prioritised_dups: + logger.debug("Final match pruning: dropping any duplicates remaining") + prioritised_singletons: pl.DataFrame = _get_singleton_variants(prioritised) + distinct: pl.DataFrame = pl.concat([singletons, prioritised_singletons]) + else: + logger.debug("Final match pruning skipped (not required)") + distinct: pl.DataFrame = pl.concat([singletons, prioritised]) else: - distinct: pl.DataFrame = singletons + distinct: pl.DataFrame = df assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" + logger.debug("Match pruning complete") return distinct +def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + .count() + .filter(pl.col('count') == 1)[:, "accession":"effect_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + + +def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + .count() + .filter(pl.col('count') > 1)[:, "accession":"effect_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + + def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: dup_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") != None) dup_no_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") == None) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3cc66f7..29579b2 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -51,13 +51,5 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) return df -def check_weights(df: pl.DataFrame) -> None: - """ Checks weights for scoring file variants that could be matched (e.g. 
have a chr & pos) """ - weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() - if any(weight_count['count'] > 1): - logger.error("Multiple effect weights per variant per accession detected in files: {}".format(list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) - raise Exception - - def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: df.with_column(pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) \ No newline at end of file diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index edb69b5..f8f5b3e 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -4,7 +4,7 @@ import polars as pl -from pgscatalog_utils.match.preprocess import handle_multiallelic, check_weights, complement_valid_alleles +from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles logger = logging.getLogger(__name__) @@ -33,10 +33,12 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, match target.file_format: case 'bim': - return (df[_default_cols()] + return (df.select(_default_cols()) + .filter(pl.col('ID') != '.') # remove missing IDs .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False)) case 'pvar': - return (df[_default_cols()] + return (df.select(_default_cols()) + .filter(pl.col('ID') != '.') .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True)) case _: logger.error("Invalid file format detected") @@ -47,7 +49,6 @@ def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])) - check_weights(scorefile) return scorefile From 242e4a330087568f3499f770cb449162fe8e28f3 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Aug 2022 12:57:34 +0100 Subject: [PATCH 11/59] Add explicit build choices to downloads --- pgscatalog_utils/download/download_scorefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 30f8ac8..fc35529 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -116,7 +116,7 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-t', '--efo', dest='efo', nargs='+', help='Traits described by an EFO term(s) (e.g. EFO_0004611)') parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. 
PGP000007)', nargs='+') - parser.add_argument('-b', '--build', dest='build', + parser.add_argument('-b', '--build', dest='build', choices=['GRCh37', 'GRCh38'], help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38') parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', From 21dfcb8d3c79181c7bfd589443a1237769392856 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 30 Aug 2022 12:58:56 +0100 Subject: [PATCH 12/59] Make sure liftover works with chains and mixed files --- pgscatalog_utils/scorefile/combine_scorefiles.py | 12 ++++++++---- pgscatalog_utils/scorefile/genome_build.py | 2 +- pgscatalog_utils/scorefile/liftover.py | 8 ++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 9ab594c..9836cfe 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -46,10 +46,11 @@ def combine_scorefiles(): .pipe(melt_effect_weights) .pipe(set_effect_type)) - # Check if the score is in the right build or could be lifted + # Annotate score with the genome_build (in GRCh notation) if current_build is None: current_build = build2GRC(h.get('genome_build')) - score = score.assign(genome_build=current_build) + + score = score.assign(genome_build=current_build) if (current_build != args.target_build) and (args.liftover is False): logger.error( @@ -59,8 +60,11 @@ def combine_scorefiles(): scorefiles.append(score) - - scorefiles: pd.DataFrame = pd.concat(scorefiles) + if len(scorefiles) > 0: + scorefiles: pd.DataFrame = pd.concat(scorefiles) + else: + logger.error("No valid scorefiles could be combined") + raise Exception if args.liftover: logger.debug("Annotating scorefiles with liftover parameters") diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index d145a2f..5fe4488 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -17,7 +17,7 @@ def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: def build2GRC(build): """Map build names so they can be compared with GRCh37 and 38""" build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', 'hg38': 'GRCh38'} # standardise build names - if build is None: + if pd.isnull(build): return None else: return build_2_GRC_dict.get(build) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 2680a09..ee8902e 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -9,7 +9,7 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: str) -> pd.DataFrame: """ Liftover genomic coordinates to a different genome build """ - df = annotate_build(df, target_build) # grab build from scoring file headers + df = annotate_build(df, target_build) # get chain_target_build (e.g. 
in hg notation to match chain files) mapped, unmapped = pd.DataFrame(), pd.DataFrame() no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build') @@ -19,15 +19,15 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st logger.debug("Liftover required for all scorefile variants") else: logger.debug("Skipping liftover for scorefiles with same build as target genome") - no_liftover[['lifted_chr', 'lifted_pos']] = no_liftover[['chr_name', 'chr_position']] # assume col structure + no_liftover.loc[:,['lifted_chr', 'lifted_pos']] = no_liftover[['chr_name', 'chr_position']] # assume col structure no_liftover.assign(liftover=None) if to_liftover.empty: logger.debug("Liftover skipped because no variants required it") else: + lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files logger.debug("Lifting over scoring files") - lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) - to_liftover[['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) + to_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) logger.debug("Liftover complete") mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] From 2461ffb96a3f1a33b16137ec7be2d36d076e076f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 30 Aug 2022 14:13:37 +0100 Subject: [PATCH 13/59] keep track of variants that pass pruning --- pgscatalog_utils/match/match.py | 8 +++-- pgscatalog_utils/match/match_variants.py | 22 +++++++----- pgscatalog_utils/match/postprocess.py | 46 ++++++++++++++++++------ pgscatalog_utils/scorefile/qc.py | 3 +- tests/test_match.py | 14 ++++---- 5 files changed, 63 insertions(+), 30 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 9d92719..ac566b2 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -9,7 +9,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, - skip_flip: bool) -> pl.DataFrame: + skip_flip: bool, keep_first_match: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -35,7 +35,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous) + return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous, keep_first_match) def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> pl.DataFrame: @@ -58,6 +58,9 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap pass_df = pl.concat([pass_df, df]) logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") + # TODO: fill nulls in certain columns with false in a nicer way + match_log['passes_pruning'] = match_log['passes_pruning'].fill_null(False) + # add match statistics to log and matches write_log((match_log.with_column(pl.col('accession').cast(str)) .join(pass_df, on='accession', how='left')), 
dataset) @@ -65,6 +68,7 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap return (matches.with_column(pl.col('accession').cast(str)) .join(pass_df, on='accession', how='left')) + def _match_keys(): return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0d31da6..a0d0230 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -35,15 +35,16 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, args.skip_flip) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, + args.skip_flip, args.keep_first_match) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip) + args.remove_ambiguous, args.skip_flip, args.keep_first_match) case "fast": logger.debug(f"Match mode: {match_mode}") matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip) + args.remove_ambiguous, args.skip_flip, args.keep_first_match) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -69,37 +70,37 @@ def _check_target_chroms(target) -> None: def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, remove_ambiguous, skip_filp) + return get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) # _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp)) + matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) return pl.concat(matches) def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool) -> pl.DataFrame: + remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, remove_ambiguous, 
skip_filp)) + matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) return pl.concat(matches) @@ -181,6 +182,9 @@ def _parse_args(args=None): help=''' Flag to not consider matched variants that may be reported on the opposite strand. Default behaviour is to flip/complement unmatched variants and check if they match.''') + parser.add_argument('--keep_first_match', dest='keep_first_match', action='store_true', + help=''' If multiple match candidates for a variant exist that can't be prioritised, + keep the first match candidate (default: drop all candidates)''') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index e002d27..b2e6472 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -7,8 +7,15 @@ logger = logging.getLogger(__name__) -def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: - df = _label_biallelic_ambiguous(df) +def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: + """ Clean up match candidates ready for writing out, including: + + - Label ambiguous variants + - Prune match candidates to select the best match for each variant in the scoring file + - Optionally remove ambiguous variants + """ + df = _label_biallelic_ambiguous(df).pipe(_prune_matches, keep_first_match) + if remove_ambiguous: logger.debug("Removing ambiguous matches") return df.filter(pl.col("ambiguous") == False) @@ -30,11 +37,23 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: return (df.with_column( pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) - .otherwise(False))).pipe(_prune_matches) + .otherwise(False))) + + +def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFrame: + """ Select the best match candidate in the target for each variant in the scoring file + + - In a scoring file (accession), each variant ID with the same effect allele and weight *must be unique* + - The variant matching process normally returns multiple match candidates for each variant ID, e.g.: + refalt > altref > refalt_flip > altref_flip + - When multiple match candidates for an ID exist, they must be prioritised and pruned to be unique + - If it's impossible to prioritise match candidates (i.e. same strategy is used), drop all matches by default + :param df: A dataframe containing multiple match candidates for each variant + :param drop_duplicates: If it's impossible to make match candidates unique, drop all candidates? + :return: A dataframe containing the best match candidate for each variant + """ -def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: - """ Select single matched variant in target for each variant in the scoring file (e.g. 
per accession) """ dups: pl.DataFrame = _get_duplicate_variants(df) if dups: @@ -42,12 +61,15 @@ def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: singletons: pl.DataFrame = _get_singleton_variants(df) prioritised: pl.DataFrame = _prioritise_match_type(dups) prioritised_dups: pl.DataFrame = _get_duplicate_variants(prioritised) - if prioritised_dups: - logger.debug("Final match pruning: dropping any duplicates remaining") - prioritised_singletons: pl.DataFrame = _get_singleton_variants(prioritised) - distinct: pl.DataFrame = pl.concat([singletons, prioritised_singletons]) + if prioritised_dups and not keep_first_match: + logger.debug("Final match pruning: dropping remaining duplicate matches") + distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised)]) + elif prioritised_dups and keep_first_match: + logger.debug("Final match pruning: keeping first match") + distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised), + prioritised.unique(maintain_order=True)]) else: - logger.debug("Final match pruning skipped (not required)") + logger.debug("Final match pruning unnecessary") distinct: pl.DataFrame = pl.concat([singletons, prioritised]) else: distinct: pl.DataFrame = df @@ -55,10 +77,11 @@ def _prune_matches(df: pl.DataFrame) -> pl.DataFrame: assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" logger.debug("Match pruning complete") - return distinct + return distinct.with_column(pl.lit(True).alias('passes_pruning')) def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: + """ Return variants with only one row (match candidate) per variant ID """ return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) .count() .filter(pl.col('count') == 1)[:, "accession":"effect_allele"] @@ -66,6 +89,7 @@ def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: + """ Return variants with more than one row (match candidate) per variant ID """ return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) .count() .filter(pl.col('count') > 1)[:, "accession":"effect_allele"] diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index ff5c942..fa5a6a8 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -75,9 +75,10 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: u_count = u_count > 1 u_count.name = 'is_duplicated' df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) - df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos + df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos return df + def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)" diff --git a/tests/test_match.py b/tests/test_match.py index 6f3394d..d437aa3 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -46,14 +46,14 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) assert 
set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles: # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False)\ + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False)\ .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref'}) @@ -62,12 +62,12 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True, keep_first_match=False) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # one of the matches is ambiguous - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) @@ -76,14 +76,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) assert df.is_empty() - flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False) + flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False, keep_first_match=False) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) - flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) + flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) .filter(pl.col('ambiguous') == True)) assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous From b190429d497e3fad9815a264a9441259eadae496 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:16:46 +0100 Subject: [PATCH 14/59] fix liftover --- pgscatalog_utils/scorefile/liftover.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index ee8902e..df9f5e1 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -25,9 +25,9 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st if to_liftover.empty: logger.debug("Liftover skipped because no variants required it") else: - lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files + lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files logger.debug("Lifting over scoring files") - to_liftover.loc[:, 
['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) + to_liftover[['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) logger.debug("Liftover complete") mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] From dca90de499c72c9055a40298029bc9aa855ed0d7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:17:23 +0100 Subject: [PATCH 15/59] fix reading plain text filesc --- pgscatalog_utils/scorefile/read.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index a3e6997..61d6d44 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -1,14 +1,18 @@ import os +from typing import Tuple + import pandas as pd import logging import gzip import io +from pandas import DataFrame + logger = logging.getLogger(__name__) -def load_scorefile(path: str) -> pd.DataFrame: +def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: logger.debug(f'Reading scorefile {path}') return (_read_header(path), pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) @@ -18,8 +22,9 @@ def load_scorefile(path: str) -> pd.DataFrame: def _read_header(path: str) -> dict: """Parses the header of a PGS Catalog format scorefle into a dictionary""" + f = io.TextIOWrapper(gzip.open(path, 'r')) try: - f = io.TextIOWrapper(gzip.open(path, 'r')) + f.readline() except gzip.BadGzipFile: f = open(path, 'r') @@ -42,6 +47,7 @@ def _read_header(path: str) -> dict: f.close() return header + def _scorefile_dtypes() -> dict[str]: """ Data types for columns that might be found in a scorefile """ return {'rsID': str, 'chr_name': str, 'chr_position': pd.UInt64Dtype(), 'effect_allele': 'str', @@ -69,4 +75,4 @@ def _get_basename(path: str) -> str: 'HmVCF Date': 'HmVCF_date', 'HmVCF N Matched Variants': 'HmVCF_n_matched', 'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped' -} # Used to maintain reverse compatibility to old scoring files \ No newline at end of file +} # Used to maintain reverse compatibility to old scoring files From d198b9cd2557d4e2b11a503efa744f5793b8df2b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:17:59 +0100 Subject: [PATCH 16/59] fix tests --- conftest.py | 8 ++++---- tests/test_combine.py | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index a30f2cd..96b33bd 100644 --- a/conftest.py +++ b/conftest.py @@ -66,7 +66,7 @@ def mini_score_path(tmp_path_factory): def mini_scorefile(mini_score_path, tmp_path_factory): # The mini scorefile overlaps well with cineca synthetic subset out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt" - args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())] + args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() @@ -78,7 +78,7 @@ def mini_scorefile(mini_score_path, tmp_path_factory): def combined_scorefile(scorefiles, tmp_path_factory): # The combined scorefile overlaps poorly with cineca synthetic subset out_path = tmp_path_factory.mktemp("scores") / "combined.txt" - args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['-o', str(out_path.resolve())] + args: list[str] = ['combine_scorefiles', '-t', 
'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() @@ -111,9 +111,9 @@ def chain_files(db, tmp_path_factory): @pytest.fixture(scope="session") -def lifted_scorefiles(scorefiles, chain_files, tmp_path_factory): +def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" - args: list[str] = ['combine_scorefiles', '-s'] + scorefiles + ['--liftover', '-c', chain_files, '-t', 'GRCh38', + args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', 'GRCh38', '-m', '0.8'] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): diff --git a/tests/test_combine.py b/tests/test_combine.py index f9ee7a1..c76bcdc 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,8 +1,11 @@ +from unittest.mock import patch + import pandas as pd import pytest import jq from pgscatalog_utils.download.score import query_score +from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles def test_combine_scorefiles(combined_scorefile, _n_variants): @@ -15,7 +18,16 @@ def test_combine_scorefiles(combined_scorefile, _n_variants): def test_liftover(lifted_scorefiles): df = pd.read_table(lifted_scorefiles) - assert df.shape[0] > 50000 # approx size + assert df.shape[0] == 832 # approx size + + +def test_fail_combine(scorefiles, tmp_path_factory): + # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception + with pytest.raises(Exception): + out_path = tmp_path_factory.mktemp("scores") / "combined.txt" + args: list[str] = ['combine_scorefiles', '-t', 'GRCh38', '-s'] + scorefiles + ['-o', str(out_path.resolve())] + with patch('sys.argv', args): + combine_scorefiles() @pytest.fixture @@ -23,3 +35,4 @@ def _n_variants(pgs_accessions): json = query_score(pgs_accessions) n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() return sum(n) + From df6b8699f7e868789e4cbe0e9016cc918600333e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:18:17 +0100 Subject: [PATCH 17/59] fix calling _parse_args() --- .../scorefile/combine_scorefiles.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 9836cfe..6b27641 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -15,6 +15,7 @@ from pgscatalog_utils.scorefile.liftover import liftover from pgscatalog_utils.scorefile.write import write_scorefile + def combine_scorefiles(): args = _parse_args() @@ -73,16 +74,11 @@ def combine_scorefiles(): write_scorefile(scorefiles, args.outfile) - -if __name__ == "__main__": - combine_scorefiles() - - def _description_text() -> str: return textwrap.dedent('''\ Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/ for details) to a 'long' table of columns needed for variant matching and subsequent calculation. - + Custom scorefiles in PGS Catalog format can be combined with PGS Catalog scoring files, and optionally liftover genomic coordinates to GRCh37 or GRCh38. The script can accept a mix of unharmonised and harmonised PGS Catalog data. 
By default all variants are output (including @@ -106,15 +102,15 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('--liftover', dest='liftover', help=' Convert scoring file variants to target genome build?', action='store_true') parser.add_argument('-t', '--target_build', dest='target_build', - choices=['GRCh37', 'GRCh38'], help='Build of target genome', + choices=['GRCh37', 'GRCh38'], help=' Build of target genome', required=True) parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', required="--liftover" in sys.argv) parser.add_argument('-m', '--min_lift', dest='min_lift', - help='If liftover, minimum proportion of variants lifted over', + help=' If liftover, minimum proportion of variants lifted over', required="--liftover" in sys.argv, default=0.95, type=float) parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', - help='Drop variants with missing information (chr/pos) and ' + help=' Drop variants with missing information (chr/pos) and ' 'non-standard alleles (e.g. HLA=P/N) from the output file.') parser.add_argument('-o', '--outfile', dest='outfile', required=True, default='combined.txt', @@ -123,3 +119,8 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) + + +if __name__ == "__main__": + combine_scorefiles() + From 1be72e970b569a5d29ba3deb368a86927ceee310 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:46:22 +0100 Subject: [PATCH 18/59] fix test_liftover --- conftest.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index 96b33bd..4bde081 100644 --- a/conftest.py +++ b/conftest.py @@ -123,15 +123,11 @@ def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): @pytest.fixture(scope="session") -def hg38_coords(tmp_path_factory): - out_path = tmp_path_factory.mktemp("dummy") / "hg38.txt" +def hg38_coords(): d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]} df = pd.DataFrame(d) - with open(out_path, 'w') as f: - f.write('#genome_build=GRCh38\n') - df.to_csv(out_path, mode='a', index=False) - df['filename'] = str(out_path.resolve()) df['accession'] = 'dummy' + df['genome_build'] = 'GRCh38' return df From 258db50d432a695d789b0f4aef27801b7a41728f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 15:49:18 +0100 Subject: [PATCH 19/59] reformat and optimise imports --- .github/workflows/main.yml | 2 +- conftest.py | 19 ++++++----- pgscatalog_utils/download/publication.py | 6 ++-- pgscatalog_utils/download/score.py | 10 +++--- pgscatalog_utils/download/trait.py | 3 +- pgscatalog_utils/log_config.py | 2 +- pgscatalog_utils/match/match.py | 1 - pgscatalog_utils/match/match_variants.py | 3 +- pgscatalog_utils/match/postprocess.py | 3 +- pgscatalog_utils/match/preprocess.py | 32 +++++++++++-------- pgscatalog_utils/match/write.py | 14 ++++---- .../scorefile/combine_scorefiles.py | 18 +++++------ pgscatalog_utils/scorefile/effect_type.py | 3 +- pgscatalog_utils/scorefile/effect_weight.py | 5 ++- pgscatalog_utils/scorefile/genome_build.py | 9 +++--- pgscatalog_utils/scorefile/harmonised.py | 3 +- pgscatalog_utils/scorefile/liftover.py | 17 ++++++---- pgscatalog_utils/scorefile/qc.py | 7 ++-- pgscatalog_utils/scorefile/read.py | 13 +++----- pgscatalog_utils/scorefile/write.py | 5 ++- 
tests/test_combine.py | 3 +- tests/test_download.py | 8 +++-- tests/test_liftover.py | 1 + tests/test_match.py | 9 +++--- 24 files changed, 105 insertions(+), 91 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6477922..bf0f138 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,5 @@ name: CI -on: [push] +on: [ push ] jobs: ci: diff --git a/conftest.py b/conftest.py index 4bde081..e322b96 100644 --- a/conftest.py +++ b/conftest.py @@ -1,12 +1,14 @@ -import pytest -from unittest.mock import patch -from pgscatalog_utils.download.download_scorefile import download_scorefile +import glob import os +from unittest.mock import patch + +import pandas as pd +import pytest import requests as req -from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles from pysqlar import SQLiteArchive -import pandas as pd -import glob + +from pgscatalog_utils.download.download_scorefile import download_scorefile +from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles @pytest.fixture(scope="session") @@ -113,8 +115,9 @@ def chain_files(db, tmp_path_factory): @pytest.fixture(scope="session") def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" - args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', 'GRCh38', - '-m', '0.8'] + ['-o', str(out_path.resolve())] + args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', + 'GRCh38', + '-m', '0.8'] + ['-o', str(out_path.resolve())] with patch('sys.argv', args): combine_scorefiles() diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py index b5e90fa..843b8a2 100644 --- a/pgscatalog_utils/download/publication.py +++ b/pgscatalog_utils/download/publication.py @@ -1,7 +1,8 @@ -import requests import logging from functools import reduce +import requests + logger = logging.getLogger(__name__) @@ -17,6 +18,3 @@ def query_publication(pgp: str) -> list[str]: pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids') logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}") return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values())) - - - diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index 61a0154..a38dc0c 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -1,8 +1,9 @@ -import requests import logging -import jq import sys +import jq +import requests + logger = logging.getLogger(__name__) @@ -36,7 +37,7 @@ def query_score(pgs_id: list[str]) -> dict: def _chunker(pgs: list[str]): size = 50 # /rest/score/{pgs_id} limit when searching multiple IDs - return(pgs[pos: pos + size] for pos in range(0, len(pgs), size)) + return (pgs[pos: pos + size] for pos in range(0, len(pgs), size)) def _parse_json_query(json: dict, build: str | None) -> dict[str, str]: @@ -53,5 +54,6 @@ def _extract_ftp_url(json: list[dict], build: str | None) -> dict[str, str]: result: list[str] = jq.compile(f'[.results][][].ftp_scoring_file').input( json).all() else: - result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input(json).all() + result: list[str] = jq.compile(f'[.results][][].ftp_harmonized_scoring_files.{build}.positions').input( + json).all() return dict(zip(id, [x.replace('https', 'ftp') for x in result])) diff --git 
a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index 981b40d..c2db495 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -1,7 +1,8 @@ -import requests import logging from functools import reduce +import requests + logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/log_config.py b/pgscatalog_utils/log_config.py index f1509a9..dcd9cbe 100644 --- a/pgscatalog_utils/log_config.py +++ b/pgscatalog_utils/log_config.py @@ -12,4 +12,4 @@ def set_logging_level(verbose: bool): else: logging.basicConfig(level=logging.WARNING, format=log_fmt, - datefmt='%Y-%m-%d %H:%M:%S') \ No newline at end of file + datefmt='%Y-%m-%d %H:%M:%S') diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 6a3f70c..927327c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -66,7 +66,6 @@ def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap .join(pass_df, on='accession', how='left')) - def _match_keys(): return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0d31da6..c2a3381 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -35,7 +35,8 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, args.skip_flip) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, + args.skip_flip) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 33a0220..13ef74c 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -1,6 +1,7 @@ +import logging from functools import reduce + import polars as pl -import logging from pgscatalog_utils.match.preprocess import complement_valid_alleles diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3cc66f7..0b073fc 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -1,6 +1,7 @@ -import polars as pl import logging +import polars as pl + logger = logging.getLogger(__name__) @@ -12,16 +13,16 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF new_col = col + '_FLIP' df = df.with_column( pl.when(pl.col(col).str.contains('^[ACGT]+$')) - .then(pl.col(col).str.replace_all("A", "V") - .str.replace_all("T", "X") - .str.replace_all("C", "Y") - .str.replace_all("G", "Z") - .str.replace_all("V", "T") - .str.replace_all("X", "A") - .str.replace_all("Y", "G") - .str.replace_all("Z", "C")) - .otherwise(pl.col(col)) - .alias(new_col) + .then(pl.col(col).str.replace_all("A", "V") + .str.replace_all("T", "X") + .str.replace_all("C", "Y") + .str.replace_all("G", "Z") + .str.replace_all("V", "T") + .str.replace_all("X", "A") + .str.replace_all("Y", "G") + .str.replace_all("Z", "C")) + .otherwise(pl.col(col)) + .alias(new_col) ) return df @@ -53,11 +54,14 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) def check_weights(df: pl.DataFrame) -> None: """ Checks weights for 
scoring file variants that could be matched (e.g. have a chr & pos) """ - weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']).count() + weight_count = df.filter(pl.col('chr_name').is_not_null() & pl.col('chr_position').is_not_null()).groupby( + ['accession', 'chr_name', 'chr_position', 'effect_allele']).count() if any(weight_count['count'] > 1): - logger.error("Multiple effect weights per variant per accession detected in files: {}".format(list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) + logger.error("Multiple effect weights per variant per accession detected in files: {}".format( + list(weight_count.filter(pl.col('count') > 1)['accession'].unique()))) raise Exception def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: - df.with_column(pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) \ No newline at end of file + df.with_column( + pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 110e308..1935bd5 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -1,7 +1,8 @@ -import polars as pl import logging import os +import polars as pl + logger = logging.getLogger(__name__) @@ -56,9 +57,10 @@ def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: for x in chroms} else: logger.debug("Split output not requested") - formatted: pl.DataFrame = (df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") - .rename({"matched_effect_allele": "effect_allele"}) - .fill_null(strategy="zero")) + formatted: pl.DataFrame = ( + df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") + .rename({"matched_effect_allele": "effect_allele"}) + .fill_null(strategy="zero")) return {'false': formatted} @@ -87,8 +89,8 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra # 2. use cumcount to number duplicate IDs # 3. 
join cumcount data on original DF, use this data for splitting ea_count: pl.DataFrame = (df.select(["ID", "effect_allele"]) - .distinct() - .with_columns([ + .distinct() + .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") ])) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 6b27641..6efeb51 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -6,13 +6,13 @@ import pandas as pd from pgscatalog_utils.log_config import set_logging_level -from pgscatalog_utils.scorefile.read import load_scorefile -from pgscatalog_utils.scorefile.harmonised import remap_harmonised -from pgscatalog_utils.scorefile.qc import quality_control -from pgscatalog_utils.scorefile.genome_build import build2GRC from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights +from pgscatalog_utils.scorefile.genome_build import build2GRC +from pgscatalog_utils.scorefile.harmonised import remap_harmonised from pgscatalog_utils.scorefile.liftover import liftover +from pgscatalog_utils.scorefile.qc import quality_control +from pgscatalog_utils.scorefile.read import load_scorefile from pgscatalog_utils.scorefile.write import write_scorefile @@ -38,14 +38,15 @@ def combine_scorefiles(): use_harmonised = True current_build = h.get('HmPOS_build') else: - logger.error(f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") + logger.error( + f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") raise Exception # Process/QC score and check variant columns score = (score.pipe(remap_harmonised, use_harmonised=True) - .pipe(quality_control, drop_missing=args.drop_missing) - .pipe(melt_effect_weights) - .pipe(set_effect_type)) + .pipe(quality_control, drop_missing=args.drop_missing) + .pipe(melt_effect_weights) + .pipe(set_effect_type)) # Annotate score with the genome_build (in GRCh notation) if current_build is None: @@ -123,4 +124,3 @@ def _parse_args(args=None) -> argparse.Namespace: if __name__ == "__main__": combine_scorefiles() - diff --git a/pgscatalog_utils/scorefile/effect_type.py b/pgscatalog_utils/scorefile/effect_type.py index 78bce7f..50c8c73 100644 --- a/pgscatalog_utils/scorefile/effect_type.py +++ b/pgscatalog_utils/scorefile/effect_type.py @@ -1,6 +1,7 @@ -import pandas as pd import logging +import pandas as pd + logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/scorefile/effect_weight.py b/pgscatalog_utils/scorefile/effect_weight.py index 2693ec6..4b95e0f 100644 --- a/pgscatalog_utils/scorefile/effect_weight.py +++ b/pgscatalog_utils/scorefile/effect_weight.py @@ -1,5 +1,6 @@ -import re import logging +import re + import pandas as pd logger = logging.getLogger(__name__) @@ -46,5 +47,3 @@ def _melt(df: pd.DataFrame) -> pd.DataFrame: """ Melt a multiple effect weight format """ ew_cols: list[str] = df.filter(regex="effect_weight_*").columns.to_list() return df.melt(value_vars=ew_cols, value_name="effect_weight", var_name="accession") - - diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index 5fe4488..06c3141 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -1,4 +1,5 @@ import logging + import pandas as pd from 
pgscatalog_utils.scorefile.read import _read_header @@ -14,17 +15,18 @@ def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']]) return df + def build2GRC(build): """Map build names so they can be compared with GRCh37 and 38""" - build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', 'hg38': 'GRCh38'} # standardise build names + build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', + 'hg38': 'GRCh38'} # standardise build names if pd.isnull(build): return None else: return build_2_GRC_dict.get(build) - -def _read_build(path: str) -> str: +def _read_build(path: str) -> dict[str, str]: """ Open scorefiles and automatically handle compressed input """ logger.debug(f'Reading header of {path}') h = _read_header(path) @@ -38,4 +40,3 @@ def _get_builds(paths: list) -> pd.DataFrame: | x_hmPOS_GRCh37.txt.gz | | x_hmPOS_GRCh37.txt.gz | hg19 | GRCh37 | """ return pd.DataFrame.from_dict({path: _read_build(path) for path in paths}, orient='index') - diff --git a/pgscatalog_utils/scorefile/harmonised.py b/pgscatalog_utils/scorefile/harmonised.py index bc9c329..b56fb93 100644 --- a/pgscatalog_utils/scorefile/harmonised.py +++ b/pgscatalog_utils/scorefile/harmonised.py @@ -1,5 +1,6 @@ -import re import logging +import re + import pandas as pd logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index df9f5e1..8dfcdd6 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -1,8 +1,10 @@ +import logging +import os + import pandas as pd import pyliftover -import os -import logging -from .genome_build import annotate_build + +from pgscatalog_utils.scorefile.genome_build import annotate_build logger = logging.getLogger(__name__) @@ -19,7 +21,8 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st logger.debug("Liftover required for all scorefile variants") else: logger.debug("Skipping liftover for scorefiles with same build as target genome") - no_liftover.loc[:,['lifted_chr', 'lifted_pos']] = no_liftover[['chr_name', 'chr_position']] # assume col structure + no_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = no_liftover[ + ['chr_name', 'chr_position']] # assume col structure no_liftover.assign(liftover=None) if to_liftover.empty: @@ -32,7 +35,7 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] .assign(liftover=True)) - unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)]\ + unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] \ .assign(liftover=False)) _check_min_liftover(mapped, unmapped, min_lift) @@ -45,7 +48,7 @@ def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: n_variants: pd.DataFrame = (pd.DataFrame(df.groupby('accession')['liftover'].count()) .reset_index() .rename({'liftover': 'n_var'}, axis=1)) - lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count())\ + lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count()) \ .rename_axis(['accession', 'liftover_status']) .reset_index()) summary: pd.DataFrame = lo_counts.merge(n_variants, on='accession') @@ -91,7 +94,7 @@ def _parse_lifted_chrom(i: str) -> str: 
def _create_liftover(chain_dir: str) -> dict['str': pyliftover.LiftOver]: """ Create LiftOver objects that can remap genomic coordinates """ builds: list[str] = ["hg19hg38", "hg38hg19"] - chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]] + chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]] lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains] logger.debug("Chain files loaded for liftover") return dict(zip(builds, lo)) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index 36b20c0..f88636d 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,6 +1,7 @@ -import pandas as pd import logging +import pandas as pd + logger = logging.getLogger(__name__) @@ -75,9 +76,10 @@ def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: u_count = u_count > 1 u_count.name = 'is_duplicated' df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) - df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos + df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos return df + def _check_shape(df: pd.DataFrame) -> None: assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)" @@ -88,4 +90,3 @@ def _check_columns(df: pd.DataFrame) -> None: "using PGS Catalog files with rsIDs you should request " \ "harmonised data files (HmPOS) instead." assert 'effect_allele' in df, "ERROR: Missing effect allele column" - diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index 61d6d44..d5c2b39 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -1,13 +1,9 @@ -import os -from typing import Tuple - -import pandas as pd -import logging - import gzip import io +import logging +import os -from pandas import DataFrame +import pandas as pd logger = logging.getLogger(__name__) @@ -59,6 +55,7 @@ def _get_basename(path: str) -> str: """ Return the basename of a scoring file without extension """ return os.path.basename(path).split('.')[0] + remap_header = { 'PGS ID': 'pgs_id', 'PGS Name': 'pgs_name', @@ -70,7 +67,7 @@ def _get_basename(path: str) -> str: 'LICENSE': 'license', # Harmonization related 'HmPOS Build': 'HmPOS_build', - 'HmPOS Date':'HmPOS_date', + 'HmPOS Date': 'HmPOS_date', 'HmVCF Reference': 'HmVCF_ref', 'HmVCF Date': 'HmVCF_date', 'HmVCF N Matched Variants': 'HmVCF_n_matched', diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index f9762b1..3f23830 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -1,6 +1,6 @@ -import pandas as pd import logging -import sqlite3 + +import pandas as pd logger = logging.getLogger(__name__) @@ -34,4 +34,3 @@ def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: return df.query('liftover == True') else: return df - diff --git a/tests/test_combine.py b/tests/test_combine.py index c76bcdc..ae7de87 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,8 +1,8 @@ from unittest.mock import patch +import jq import pandas as pd import pytest -import jq from pgscatalog_utils.download.score import query_score from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles @@ -35,4 +35,3 @@ def 
_n_variants(pgs_accessions): json = query_score(pgs_accessions) n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() return sum(n) - diff --git a/tests/test_download.py b/tests/test_download.py index 611740e..13fdeeb 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,11 +1,12 @@ import os -import pytest from unittest.mock import patch -from pgscatalog_utils.download.trait import query_trait +import pytest + +from pgscatalog_utils.download.download_scorefile import download_scorefile from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url -from pgscatalog_utils.download.download_scorefile import download_scorefile +from pgscatalog_utils.download.trait import query_trait @pytest.fixture(params=[["PGS000001"], ["PGS000001", "PGS000802"]]) @@ -32,6 +33,7 @@ def test_download_scorefile_author(tmp_path): download_scorefile() assert os.listdir(out_dir) == ['PGS000001.txt.gz'] + def test_download_scorefile_hmPOS(tmp_path): out_dir = str(tmp_path.resolve()) args: list[str] = ['download_scorefiles', '-i', 'PGS000001', '-b', 'GRCh38', '-o', out_dir] diff --git a/tests/test_liftover.py b/tests/test_liftover.py index 66ebac5..b2f03a0 100644 --- a/tests/test_liftover.py +++ b/tests/test_liftover.py @@ -1,4 +1,5 @@ import pandas as pd + from pgscatalog_utils.scorefile.liftover import liftover diff --git a/tests/test_match.py b/tests/test_match.py index 6f3394d..717adfb 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -1,5 +1,5 @@ -import os from unittest.mock import patch + import polars as pl import pytest @@ -53,8 +53,8 @@ def test_match_strategies(small_scorefile, small_target): # when keeping ambiguous and flipping alleles: # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False)\ - .filter(pl.col('ambiguous') == True)) + flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False) \ + .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref'}) @@ -62,7 +62,7 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True) + df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) @@ -123,4 +123,3 @@ def small_target(): "ALT": ["C", "A", "G"], "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], "is_multiallelic": [False, False, False]}) - From 2bf12201ff582e80723ac7ec5051d14b6b86cb7a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 16:29:33 +0100 Subject: [PATCH 20/59] concat columns instead of setting values directly --- pgscatalog_utils/scorefile/liftover.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 8dfcdd6..45258b1 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -30,7 +30,8 @@ def liftover(df: pd.DataFrame, chain_dir: str, min_lift: float, target_build: st else: lo: 
dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files logger.debug("Lifting over scoring files") - to_liftover[['lifted_chr', 'lifted_pos']] = to_liftover.apply(lambda x: _convert_coordinates(x, lo), axis=1) + lifted: pd.DataFrame = to_liftover.apply(_convert_coordinates, axis=1, lo_dict=lo) + to_liftover = pd.concat([to_liftover, lifted], axis=1) logger.debug("Liftover complete") mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] @@ -65,6 +66,8 @@ def _check_min_liftover(mapped: pd.DataFrame, unmapped: pd.DataFrame, min_lift: def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) -> pd.Series: """ Convert genomic coordinates to different build """ + converted: list[tuple[str, int, str, int]] | None + if df[['chr_name', 'chr_position']].isnull().values.any(): converted = None else: @@ -72,14 +75,14 @@ def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) chrom: str = 'chr' + str(df['chr_name']) pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed # converted example: [('chr22', 15460378, '+', 3320966530)] or None - converted: list[tuple[str, int, str, int] | None] = lo.convert_coordinate(chrom, pos) + converted = lo.convert_coordinate(chrom, pos) if converted: lifted_chrom: str = _parse_lifted_chrom(converted[0][0][3:]) # return first matching liftover lifted_pos: int = int(converted[0][1]) + 1 # reverse 0 indexing - return pd.Series([lifted_chrom, lifted_pos]) + return pd.Series([lifted_chrom, lifted_pos], index=['lifted_chr', 'lifted_pos']) else: - return pd.Series([None, None]) + return pd.Series([None, None], index=['lifted_chr', 'lifted_pos']) def _parse_lifted_chrom(i: str) -> str: From e2e63f94cccb374e809015ab6c84e8ab4d25cdb8 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 31 Aug 2022 16:32:33 +0100 Subject: [PATCH 21/59] remove unused functions --- pgscatalog_utils/scorefile/genome_build.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py index 06c3141..7ea4f09 100644 --- a/pgscatalog_utils/scorefile/genome_build.py +++ b/pgscatalog_utils/scorefile/genome_build.py @@ -2,8 +2,6 @@ import pandas as pd -from pgscatalog_utils.scorefile.read import _read_header - logger = logging.getLogger(__name__) @@ -24,19 +22,3 @@ def build2GRC(build): return None else: return build_2_GRC_dict.get(build) - - -def _read_build(path: str) -> dict[str, str]: - """ Open scorefiles and automatically handle compressed input """ - logger.debug(f'Reading header of {path}') - h = _read_header(path) - return {k: h.get(k, None) for k in ('genome_build', 'HmPOS_build')} - - -def _get_builds(paths: list) -> pd.DataFrame: - """ Get genome builds for a series of scorefile paths - | filename | -> | | genome_build | HmPOS_build | - | x.txt.gz | | x.txt.gz | hg19 | None | - | x_hmPOS_GRCh37.txt.gz | | x_hmPOS_GRCh37.txt.gz | hg19 | GRCh37 | - """ - return pd.DataFrame.from_dict({path: _read_build(path) for path in paths}, orient='index') From d7168e4652938a128712dae1aa70781bc0d8853e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 1 Sep 2022 09:15:48 +0100 Subject: [PATCH 22/59] Update combine_scorefiles.py --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 
6efeb51..2f4cdd1 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -43,7 +43,7 @@ def combine_scorefiles(): raise Exception # Process/QC score and check variant columns - score = (score.pipe(remap_harmonised, use_harmonised=True) + score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised) .pipe(quality_control, drop_missing=args.drop_missing) .pipe(melt_effect_weights) .pipe(set_effect_type)) From f75d401b7506081e01cc5d7506c2c1f8a02f739c Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 1 Sep 2022 10:25:54 +0100 Subject: [PATCH 23/59] Make genome build a required header item for combine_scorefiles --- pgscatalog_utils/scorefile/combine_scorefiles.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 2f4cdd1..5b30fda 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -51,6 +51,11 @@ def combine_scorefiles(): # Annotate score with the genome_build (in GRCh notation) if current_build is None: current_build = build2GRC(h.get('genome_build')) + if current_build is None: + logger.error("Scorefile has no build information, " + "please add the build to the header with " + "('#genome_build=[insert variant build]") + raise Exception score = score.assign(genome_build=current_build) From a28f4c4b0c821f4d68f0199e75b0bc82525e5fc3 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 1 Sep 2022 14:59:28 +0100 Subject: [PATCH 24/59] bump version --- Dockerfile | 4 ++-- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9e97be8..8c19690 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ FROM python:3.10 WORKDIR /opt/ -COPY --from=builder /app/dist/pgscatalog_utils-0.1.1-py3-none-any.whl . +COPY --from=builder /app/dist/pgscatalog_utils-0.1.2-py3-none-any.whl . 
-RUN pip install pgscatalog_utils-0.1.1-py3-none-any.whl +RUN pip install pgscatalog_utils-0.1.2-py3-none-any.whl RUN apt-get update && apt-get install -y sqlite3 \ No newline at end of file diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index df9144c..10939f0 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.1.2' diff --git a/pyproject.toml b/pyproject.toml index 44ef233..acfcb36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.1.1" +version = "0.1.2" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert "] From b982ce669cdbaf7f8454eaffdc56b8f5d47f2c09 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 5 Sep 2022 16:18:54 +0100 Subject: [PATCH 25/59] fix _prioritise_match_type() with flipped match strategies --- pgscatalog_utils/match/postprocess.py | 50 +++++++++------------------ 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 71da4b6..fae0b0d 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -54,26 +54,21 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFr :param drop_duplicates: If it's impossible to make match candidates unique, drop all candidates? :return: A dataframe containing the best match candidate for each variant """ - - dups: pl.DataFrame = _get_duplicate_variants(df) + logger.debug("First match pruning: prioritise by match types") + prioritised = _prioritise_match_type(df) + singletons: pl.DataFrame = _get_singleton_variants(prioritised) + dups: pl.DataFrame = _get_duplicate_variants(prioritised) if dups: - logger.debug("First match pruning: prioritise by match types") - singletons: pl.DataFrame = _get_singleton_variants(df) - prioritised: pl.DataFrame = _prioritise_match_type(dups) - prioritised_dups: pl.DataFrame = _get_duplicate_variants(prioritised) - if prioritised_dups and not keep_first_match: - logger.debug("Final match pruning: dropping remaining duplicate matches") - distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised)]) - elif prioritised_dups and keep_first_match: + if keep_first_match: logger.debug("Final match pruning: keeping first match") - distinct: pl.DataFrame = pl.concat([singletons, _get_singleton_variants(prioritised), - prioritised.unique(maintain_order=True)]) + distinct: pl.DataFrame = pl.concat([singletons, dups.unique(maintain_order=True)]) else: - logger.debug("Final match pruning unnecessary") - distinct: pl.DataFrame = pl.concat([singletons, prioritised]) + logger.debug("Final match pruning: dropping remaining duplicate matches") + distinct: pl.DataFrame = singletons else: - distinct: pl.DataFrame = df + logger.debug("Final match pruning unnecessary") + distinct: pl.DataFrame = singletons assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" logger.debug("Match pruning complete") @@ -98,34 +93,23 @@ def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: - dup_oa: pl.DataFrame = duplicates.filter(pl.col("other_allele") != None) - dup_no_oa: pl.DataFrame = 
duplicates.filter(pl.col("other_allele") == None) - best_matches: list[pl.DataFrame] = [] - - if dup_oa: - match_priority: list[str] = ['refalt', 'altref', 'refalt_flip', 'altref_flip'] - logger.debug(f"Prioritising matches in order {match_priority}") - best_matches.append(_get_best_match(dup_oa, match_priority)) - - if dup_no_oa: - match_priority: list[str] = ['no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] - logger.debug(f"Prioritising matches in order {match_priority}") - best_matches.append(_get_best_match(dup_no_oa, match_priority)) - - return pl.concat(best_matches) + # first element has the highest priority and last element has the lowest priority + match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', + 'no_oa_alt_flip'] + return _get_best_match(duplicates, match_priority) def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: match: list[pl.DataFrame] = [] for match_type in match_priority: + logger.debug(f"Selecting matches with match type {match_type}") match.append(df.filter(pl.col("match_type") == match_type)) - logger.debug("Filtering best match types") + logger.debug("Prioritising match types (refalt > altref > ...)") return reduce(lambda x, y: _join_best_match(x, y), match) def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: # variants in dataframe x have a higher priority than dataframe y # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', - on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) + not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID']) return pl.concat([x, not_in]) From 8795a5806e5f5a6218420917611c43678cd85290 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 5 Sep 2022 16:51:25 +0100 Subject: [PATCH 26/59] add other_allele to _get_singleton_variants and _get_duplicate_variants --- pgscatalog_utils/match/postprocess.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index fae0b0d..ba42378 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -78,18 +78,18 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFr def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: """ Return variants with only one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) .count() - .filter(pl.col('count') == 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + .filter(pl.col('count') == 1)[:, "accession":"other_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: """ Return variants with more than one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele']) + return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) .count() - .filter(pl.col('count') > 1)[:, "accession":"effect_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele'], how='left')) + 
.filter(pl.col('count') > 1)[:, "accession":"other_allele"] + .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: From 35a08d9d0f6be9230bad8615d326e8b1fdbf030b Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 5 Sep 2022 17:29:27 +0100 Subject: [PATCH 27/59] Readability/doc-edits --- pgscatalog_utils/match/postprocess.py | 36 +++++++++++++-------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index ba42378..b7e14b0 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -41,23 +41,22 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: .otherwise(False))) -def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFrame: +def _prune_matches(df: pl.DataFrame, keep_first_match: bool = False) -> pl.DataFrame: """ Select the best match candidate in the target for each variant in the scoring file - - In a scoring file (accession), each variant ID with the same effect allele and weight *must be unique* - The variant matching process normally returns multiple match candidates for each variant ID, e.g.: refalt > altref > refalt_flip > altref_flip - When multiple match candidates for an ID exist, they must be prioritised and pruned to be unique - If it's impossible to prioritise match candidates (i.e. same strategy is used), drop all matches by default + - In a scoring file (accession), each variant ID *must be unique* (have only one weight and effect_allele) :param df: A dataframe containing multiple match candidates for each variant - :param drop_duplicates: If it's impossible to make match candidates unique, drop all candidates? + :param keep_first_match: If it's impossible to make match candidates unique, keep the first occuring variant? :return: A dataframe containing the best match candidate for each variant """ logger.debug("First match pruning: prioritise by match types") prioritised = _prioritise_match_type(df) - singletons: pl.DataFrame = _get_singleton_variants(prioritised) - dups: pl.DataFrame = _get_duplicate_variants(prioritised) + singletons, dups = _divide_matches(prioritised) if dups: if keep_first_match: @@ -70,26 +69,25 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = True) -> pl.DataFr logger.debug("Final match pruning unnecessary") distinct: pl.DataFrame = singletons - assert all(distinct.groupby(['accession', 'ID']).count()['count'] == 1), "Duplicate effect weights for a variant" + # Final QC check + u_counts = distinct.groupby(['accession', 'ID']).count() + assert all(u_counts['count'] == 1), "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) + logger.debug("Match pruning complete") return distinct.with_column(pl.lit(True).alias('passes_pruning')) -def _get_singleton_variants(df: pl.DataFrame) -> pl.DataFrame: - """ Return variants with only one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) - .count() - .filter(pl.col('count') == 1)[:, "accession":"other_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) - +def _divide_matches(df: pl.DataFrame) -> tuple [ pl.DataFrame, pl.DataFrame ]: + """ Divide score file match candidates with only one row (unique) vs. 
multiple (duplicates)""" + join_cols = ['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'] + counted = df.groupby(join_cols).count() + singletons = (counted.filter(pl.col('count') == 1)[:, "accession":"other_allele"] + .join(df, on=join_cols, how='left')) + duplicates = (counted.filter(pl.col('count') > 1)[:, "accession":"other_allele"] + .join(df, on=join_cols, how='left')) -def _get_duplicate_variants(df: pl.DataFrame) -> pl.DataFrame: - """ Return variants with more than one row (match candidate) per variant ID """ - return (df.groupby(['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele']) - .count() - .filter(pl.col('count') > 1)[:, "accession":"other_allele"] - .join(df, on=['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'], how='left')) + return singletons, duplicates def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: From 356a479661839470802bbf3676bc7fd8f154b604 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 5 Sep 2022 18:21:47 +0100 Subject: [PATCH 28/59] Number lines w/in an accession --- pgscatalog_utils/scorefile/read.py | 5 ++--- pgscatalog_utils/scorefile/write.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index d5c2b39..14cb52d 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -10,10 +10,9 @@ def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: logger.debug(f'Reading scorefile {path}') + df = pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) return (_read_header(path), - pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) - .assign(filename_prefix=_get_basename(path), - filename=path)) + df.assign(filename_prefix=_get_basename(path), filename=path, row_nr=df.index)) def _read_header(path: str) -> dict: diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 3f23830..0dd7b38 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -7,7 +7,7 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession'] + 'is_duplicated', 'accession', 'row_nr'] if df.empty: logger.error("Empty scorefile output! 
Please check the input data") From 2503cb62407b3edde0993d918289676ae959f3c1 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 5 Sep 2022 18:34:19 +0100 Subject: [PATCH 29/59] Use row_nr to priortise matches, and ID to de-duplicate scoring files --- pgscatalog_utils/match/match.py | 2 +- pgscatalog_utils/match/postprocess.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index ac566b2..6da2e9c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -15,7 +15,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) matches: list[pl.DataFrame] = [] - col_order = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', + col_order = ['row_nr', 'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index b7e14b0..7676902 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -79,22 +79,22 @@ def _prune_matches(df: pl.DataFrame, keep_first_match: bool = False) -> pl.DataF def _divide_matches(df: pl.DataFrame) -> tuple [ pl.DataFrame, pl.DataFrame ]: - """ Divide score file match candidates with only one row (unique) vs. multiple (duplicates)""" - join_cols = ['accession', 'chr_name', 'chr_position', 'effect_allele', 'other_allele'] + """ Divide scorefile (accession) matches with only one ID match (singletons) vs. 
multiple (duplicates)""" + join_cols = ['accession', 'ID'] counted = df.groupby(join_cols).count() - singletons = (counted.filter(pl.col('count') == 1)[:, "accession":"other_allele"] + singletons = (counted.filter(pl.col('count') == 1)[:, join_cols] .join(df, on=join_cols, how='left')) - duplicates = (counted.filter(pl.col('count') > 1)[:, "accession":"other_allele"] + duplicates = (counted.filter(pl.col('count') > 1)[:, join_cols] .join(df, on=join_cols, how='left')) return singletons, duplicates -def _prioritise_match_type(duplicates: pl.DataFrame) -> pl.DataFrame: - # first element has the highest priority and last element has the lowest priority +def _prioritise_match_type(all_matches: pl.DataFrame) -> pl.DataFrame: + # Select best match for each row in the scoring file match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] - return _get_best_match(duplicates, match_priority) + return _get_best_match(all_matches, match_priority) def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: @@ -109,5 +109,5 @@ def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: # variants in dataframe x have a higher priority than dataframe y # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID']) + not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'row_nr']) return pl.concat([x, not_in]) From 13814b53d4baa61c259a31c6227d42a8d5a2b8f0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 6 Sep 2022 16:56:23 +0100 Subject: [PATCH 30/59] compress log output --- pgscatalog_utils/match/log.py | 19 +++++++++++++++++++ pgscatalog_utils/match/write.py | 4 ---- 2 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 pgscatalog_utils/match/log.py diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py new file mode 100644 index 0000000..13085cb --- /dev/null +++ b/pgscatalog_utils/match/log.py @@ -0,0 +1,19 @@ +import gzip +import logging + +import polars as pl + +logger = logging.getLogger(__name__) + + +def write_log(df: pl.DataFrame, dataset: str) -> None: + logger.debug("Compressing and writing log") + with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: + df.pipe(_prettify_log).write_csv(f) + + +def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: + keep_cols = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", + "accession", "row_nr", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", + "ambiguous", "duplicate", "best_match", "dataset", "score_pass", "match_rate"] + return df.select(keep_cols).select(pl.exclude("^.*_right")) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 1935bd5..7a8a880 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -21,10 +21,6 @@ def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: [_write_scorefile(ea_dict.get(k), v, split, outdir, dataset) for k, v in deduplicated.items()] -def write_log(df: pl.DataFrame, dataset: str) -> None: - df.write_csv(f"{dataset}_log.csv") - - def _write_scorefile(effect_type: str, scorefiles: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: """ Write a list of scorefiles with the same effect type """ # each list 
element contains a dataframe of variants From 724144d464e1ed4628fd73e0389153cbad36061a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 6 Sep 2022 17:01:10 +0100 Subject: [PATCH 31/59] refactor filtering score and variants to filter module --- pgscatalog_utils/match/filter.py | 91 ++++++++++++++++++++++ pgscatalog_utils/match/match.py | 46 +---------- pgscatalog_utils/match/match_variants.py | 36 ++++----- pgscatalog_utils/match/postprocess.py | 97 +++++++++--------------- tests/test_match.py | 14 ++-- 5 files changed, 153 insertions(+), 131 deletions(-) create mode 100644 pgscatalog_utils/match/filter.py diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py new file mode 100644 index 0000000..caa9360 --- /dev/null +++ b/pgscatalog_utils/match/filter.py @@ -0,0 +1,91 @@ +import logging + +import polars as pl + +from pgscatalog_utils.match.log import write_log + +logger = logging.getLogger(__name__) + + +def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool, + min_overlap: float, dataset: str) -> pl.DataFrame: + """ Remove scores that don't match well """ + scorefile: pl.DataFrame = scorefile.with_columns([ + pl.col('effect_type').cast(pl.Categorical), + pl.col('accession').cast(pl.Categorical)]) # same dtypes for join + + # matches may contain more than one row per variant in the scoring file + # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file + filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) + match_log: pl.DataFrame = _join_matches(filtered_matches, scorefile, dataset) + match_log['best_match'] = match_log['best_match'].fill_null(False) + + fail_rates: pl.DataFrame = _calculate_match_rate(match_log) + + scores: list[pl.DataFrame] = [] + for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): + if rate < (1 - min_overlap): + df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [True], 'match_rate': [1 - rate]}) + logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)") + scores.append(df) + else: + df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [False], 'match_rate': [1 - rate]}) + logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") + scores.append(df) + + (match_log.with_column(pl.col('accession').cast(str)) + .join(pl.concat(scores), on='accession', how='left')).pipe(write_log, dataset) # write log to gzipped CSV + + return (filtered_matches.with_column(pl.col('accession').cast(str)) + .join(pl.concat(scores), on='accession', how='left')) + + +def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Calculating overlap between target genome and scoring file") + return (df.groupby('accession') + .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) + .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate'))) + + +def _filter_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: + logger.debug("Final match candidate filtering") + return (df.filter(pl.col('best_match') == True) + .pipe(_handle_ambiguous, remove_ambiguous) + .pipe(_handle_duplicates, keep_first_match)) + + +def _handle_ambiguous(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: + if remove_ambiguous: + logger.debug("Filtering: Removing ambiguous matches") + return 
df.filter(pl.col("ambiguous") == False) + else: + logger.debug("Filtering: Keeping best possible match from ambiguous matches") + ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ + (pl.col("match_type").str.contains('flip').is_not())) + unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) + return pl.concat([ambiguous, unambiguous]) + + +def _handle_duplicates(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: + singletons = df.filter(pl.col('duplicate') == False) + if keep_first_match: + logger.debug("Filtering: keeping first match") + first = (df.filter(pl.col('duplicate') == True) + .groupby(["accession", "ID"]) + .agg([pl.col("row_nr").first()]) + .join(df, on=['accession', 'row_nr'], how='left')) + return pl.concat([singletons, first.select(singletons.columns)]) + else: + logger.debug("Filtering: dropping any duplicate matches") + return singletons + + +def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: + return (scorefile.join(matches, on=['accession', 'row_nr'], how='left') + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))) + + +def _match_keys() -> list[str]: + return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', + 'accession', 'effect_type', 'effect_weight'] diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 6da2e9c..9387146 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -3,13 +3,11 @@ import polars as pl from pgscatalog_utils.match.postprocess import postprocess_matches -from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambiguous: bool, - skip_flip: bool, keep_first_match: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -35,47 +33,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, remove_ambigu matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(postprocess_matches, remove_ambiguous, keep_first_match) - - -def check_match_rate(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, dataset: str) -> pl.DataFrame: - scorefile: pl.DataFrame = scorefile.with_columns([ - pl.col('effect_type').cast(pl.Categorical), - pl.col('accession').cast(pl.Categorical)]) # same dtypes for join - match_log: pl.DataFrame = _join_matches(matches, scorefile, dataset) - fail_rates: pl.DataFrame = (match_log.groupby('accession') - .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) - .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate')) - ) - pass_df: pl.DataFrame = pl.DataFrame() - for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): - if rate < (1 - min_overlap): - df = pl.DataFrame({'accession': [accession], 'match_pass': [True], 'match_rate': [1 - rate]}) - pass_df = pl.concat([pass_df, df]) - logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants 
match)") - else: - df = pl.DataFrame({'accession': [accession], 'match_pass': [False], 'match_rate': [1 - rate]}) - pass_df = pl.concat([pass_df, df]) - logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") - - # TODO: fill nulls in certain columns with false in a nicer way - match_log['passes_pruning'] = match_log['passes_pruning'].fill_null(False) - - # add match statistics to log and matches - write_log((match_log.with_column(pl.col('accession').cast(str)) - .join(pass_df, on='accession', how='left')), dataset) - - return (matches.with_column(pl.col('accession').cast(str)) - .join(pass_df, on='accession', how='left')) - - -def _match_keys(): - return ['chr_name', 'chr_position', 'effect_allele', 'other_allele', - 'accession', 'effect_type', 'effect_weight'] - - -def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str): - return scorefile.join(matches, on=_match_keys(), how='left').with_column(pl.lit(dataset).alias('dataset')) + return pl.concat(matches).pipe(postprocess_matches) def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 5a67dd0..6937a90 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -6,7 +6,8 @@ import polars as pl from pgscatalog_utils.log_config import set_logging_level -from pgscatalog_utils.match.match import get_all_matches, check_match_rate +from pgscatalog_utils.match.match import get_all_matches +from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out @@ -35,29 +36,28 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.remove_ambiguous, - args.skip_flip, args.keep_first_match) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip, args.keep_first_match) + args.skip_flip) case "fast": logger.debug(f"Match mode: {match_mode}") matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.remove_ambiguous, args.skip_flip, args.keep_first_match) + args.skip_flip) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches: pl.DataFrame = (check_match_rate(scorefile, matches, args.min_overlap, dataset) - .filter(pl.col('match_pass') == True)) + valid_matches = filter_scores(scorefile, matches, args.remove_ambiguous, args.keep_first_match, args.min_overlap, + dataset) - if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 - logger.error("Error: no target variants match any variants in scoring files") - raise Exception + if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 + logger.error("Error: no target variants match any variants in scoring files") + raise Exception - write_out(valid_matches, args.split, args.outdir, dataset) + write_out(valid_matches, args.split, args.outdir, dataset) def _check_target_chroms(target) -> None: @@ -70,37 +70,37 @@ def 
_check_target_chroms(target) -> None: def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: + skip_filp: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match) + return get_all_matches(scorefile, target, skip_filp) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: + skip_filp: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, - remove_multiallelic=remove_multiallelic) # + remove_multiallelic=remove_multiallelic) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) + matches.append(get_all_matches(scorefile, target, skip_filp)) return pl.concat(matches) def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - remove_ambiguous: bool, skip_filp: bool, keep_first_match: bool) -> pl.DataFrame: + skip_filp: bool) -> pl.DataFrame: matches = [] for chrom in scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, remove_ambiguous, skip_filp, keep_first_match)) + matches.append(get_all_matches(scorefile, target, skip_filp)) return pl.concat(matches) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py index 7676902..0ae0b2b 100644 --- a/pgscatalog_utils/match/postprocess.py +++ b/pgscatalog_utils/match/postprocess.py @@ -8,24 +8,17 @@ logger = logging.getLogger(__name__) -def postprocess_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - """ Clean up match candidates ready for writing out, including: +def postprocess_matches(df: pl.DataFrame) -> pl.DataFrame: + """ Label match candidates with additional metadata. Column definitions: - - Label ambiguous variants - - Prune match candidates to select the best match for each variant in the scoring file - - Optionally remove ambiguous variants + - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) + - best_match: True if row is the best possible match type (refalt > altref > ...) 
+ - duplicate: True if >1 scoring file line matches to the same variant ID + - ambiguous: True if ambiguous """ - df = _label_biallelic_ambiguous(df).pipe(_prune_matches, keep_first_match) - - if remove_ambiguous: - logger.debug("Removing ambiguous matches") - return df.filter(pl.col("ambiguous") == False) - else: - logger.debug("Keeping best possible match from ambiguous matches") - ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ - (pl.col("match_type").str.contains('flip').is_not())) - unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) - return pl.concat([ambiguous, unambiguous]) + return (df.with_column(pl.lit(True).alias('match_candidate')) + .pipe(_label_biallelic_ambiguous) + .pipe(_label_pruned_matches)) def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: @@ -41,73 +34,53 @@ def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: .otherwise(False))) -def _prune_matches(df: pl.DataFrame, keep_first_match: bool = False) -> pl.DataFrame: - """ Select the best match candidate in the target for each variant in the scoring file - - - The variant matching process normally returns multiple match candidates for each variant ID, e.g.: - refalt > altref > refalt_flip > altref_flip - - When multiple match candidates for an ID exist, they must be prioritised and pruned to be unique - - If it's impossible to prioritise match candidates (i.e. same strategy is used), drop all matches by default - - In a scoring file (accession), each variant ID *must be unique* (have only one weight and effect_allele) - - :param df: A dataframe containing multiple match candidates for each variant - :param keep_first_match: If it's impossible to make match candidates unique, keep the first occuring variant? - :return: A dataframe containing the best match candidate for each variant - """ - logger.debug("First match pruning: prioritise by match types") - prioritised = _prioritise_match_type(df) - singletons, dups = _divide_matches(prioritised) +def _label_pruned_matches(df: pl.DataFrame) -> pl.DataFrame: + best_matches = (df.pipe(_label_best_match) + .pipe(_label_duplicates)) - if dups: - if keep_first_match: - logger.debug("Final match pruning: keeping first match") - distinct: pl.DataFrame = pl.concat([singletons, dups.unique(maintain_order=True)]) - else: - logger.debug("Final match pruning: dropping remaining duplicate matches") - distinct: pl.DataFrame = singletons - else: - logger.debug("Final match pruning unnecessary") - distinct: pl.DataFrame = singletons + # check that duplicates were correctly labelled + u_counts = best_matches.filter(pl.col('duplicate') == False).groupby(['accession', 'ID']).count() + assert (u_counts['count'] == 1).all(), \ + "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) - # Final QC check - u_counts = distinct.groupby(['accession', 'ID']).count() - assert all(u_counts['count'] == 1), "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) + labelled = (df.join(best_matches, how='left', on=['row_nr', 'accession', 'ID']) + .select(pl.exclude("^.*_right$"))) + assert labelled.shape[0] == df.shape[0] # don't want to lose any rows from the input df - logger.debug("Match pruning complete") + return labelled - return distinct.with_column(pl.lit(True).alias('passes_pruning')) +def _label_duplicates(df: pl.DataFrame) -> pl.DataFrame: + """ Label scorefile (accession) matches with only one ID match (singletons) vs. 
multiple (duplicates)""" + logger.debug('Labelling multiple accession - ID rows as duplicates') -def _divide_matches(df: pl.DataFrame) -> tuple [ pl.DataFrame, pl.DataFrame ]: - """ Divide scorefile (accession) matches with only one ID match (singletons) vs. multiple (duplicates)""" join_cols = ['accession', 'ID'] counted = df.groupby(join_cols).count() singletons = (counted.filter(pl.col('count') == 1)[:, join_cols] - .join(df, on=join_cols, how='left')) + .join(df, on=join_cols, how='left') + .with_column(pl.lit(False).alias('duplicate'))) duplicates = (counted.filter(pl.col('count') > 1)[:, join_cols] - .join(df, on=join_cols, how='left')) + .join(df, on=join_cols, how='left') + .with_column(pl.lit(True).alias('duplicate'))) - return singletons, duplicates + return pl.concat([singletons, duplicates]) -def _prioritise_match_type(all_matches: pl.DataFrame) -> pl.DataFrame: - # Select best match for each row in the scoring file +def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', 'no_oa_alt_flip'] - return _get_best_match(all_matches, match_priority) - - -def _get_best_match(df: pl.DataFrame, match_priority: list[str]) -> pl.DataFrame: match: list[pl.DataFrame] = [] for match_type in match_priority: logger.debug(f"Selecting matches with match type {match_type}") match.append(df.filter(pl.col("match_type") == match_type)) - logger.debug("Prioritising match types (refalt > altref > ...)") - return reduce(lambda x, y: _join_best_match(x, y), match) + + logger.debug("Labelling best match type (refalt > altref > ...)") + best_match: pl.DataFrame = reduce(lambda x, y: _prioritise_best_match(x, y), match) + return best_match.with_column(pl.lit(True).alias('best_match')) -def _join_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: +def _prioritise_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: # variants in dataframe x have a higher priority than dataframe y # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'row_nr']) + not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID', 'row_nr']) return pl.concat([x, not_in]) diff --git a/tests/test_match.py b/tests/test_match.py index 70b9671..0dc9a2a 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -46,14 +46,14 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True) assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles: # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False)\ + flip = (get_all_matches(scorefile, target, skip_flip=False)\ .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) @@ -63,13 +63,13 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) 
- df = get_all_matches(scorefile, target, remove_ambiguous=True,skip_flip=True, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # one of the matches is ambiguous - flip = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) + flip = (get_all_matches(scorefile, target, skip_flip=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) @@ -78,14 +78,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=True, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True) assert df.is_empty() - flip = get_all_matches(scorefile, target, remove_ambiguous=True, skip_flip=False, keep_first_match=False) + flip = get_all_matches(scorefile, target, skip_flip=False) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) - flip_ambig = (get_all_matches(scorefile, target, remove_ambiguous=False, skip_flip=False, keep_first_match=False) + flip_ambig = (get_all_matches(scorefile, target, skip_flip=False) .filter(pl.col('ambiguous') == True)) assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous From 98b34873ecb25dfc6f345a2ca7792719e49afecd Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 6 Sep 2022 17:14:42 +0100 Subject: [PATCH 32/59] fix tests --- tests/test_combine.py | 2 +- tests/test_match.py | 27 +++++++++++---------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/tests/test_combine.py b/tests/test_combine.py index ae7de87..8be71c2 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -11,7 +11,7 @@ def test_combine_scorefiles(combined_scorefile, _n_variants): df = pd.read_table(combined_scorefile) cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession'} + 'is_duplicated', 'accession', 'row_nr'} assert set(df.columns).issubset(cols) assert df.shape[0] == _n_variants diff --git a/tests/test_match.py b/tests/test_match.py index 0dc9a2a..42d0e87 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -46,53 +46,48 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, skip_flip=True) + df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) - # when keeping ambiguous and flipping alleles: - # 2:2:T:A is ambiguous, and matches 'altref' and 'refalt_flip' - # flipped matches should be dropped for ambiguous matches - flip = (get_all_matches(scorefile, target, skip_flip=False)\ - .filter(pl.col('ambiguous') == True)) + # when keeping ambiguous and flipping alleles + flip = (get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) - 
assert set(flip['match_type'].to_list()).issubset({'altref'}) + assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'}) def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, skip_flip=True) + df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) - # one of the matches is ambiguous + # check ambiguous matches flip = (get_all_matches(scorefile, target, skip_flip=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) - assert set(flip['match_type'].to_list()).issubset({'no_oa_alt'}) + assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'}) def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) df = get_all_matches(scorefile, target, skip_flip=True) - assert df.is_empty() + assert set(df['ambiguous']) == {True} + assert set(df['match_type']) == {'refalt'} - flip = get_all_matches(scorefile, target, skip_flip=False) + flip = get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == False) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) - flip_ambig = (get_all_matches(scorefile, target, skip_flip=False) - .filter(pl.col('ambiguous') == True)) - assert not flip_ambig['match_type'].str.contains('flip').any() # no flip matches for ambiguous - @pytest.fixture def small_scorefile(): df = pl.DataFrame({"accession": ["test", "test", "test"], + "row_nr": [1, 2, 3], "chr_name": [1, 2, 3], "chr_position": [1, 2, 3], "effect_allele": ["A", "A", "G"], From 856c7bc65f3a194e93253024ba1c089a226f9e44 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:38:12 +0100 Subject: [PATCH 33/59] stop using joins with labelling, because variants went missing --- pgscatalog_utils/match/label.py | 95 ++++++++++++++++++++++++ pgscatalog_utils/match/match.py | 7 +- pgscatalog_utils/match/match_variants.py | 36 +++++---- pgscatalog_utils/match/postprocess.py | 86 --------------------- 4 files changed, 120 insertions(+), 104 deletions(-) create mode 100644 pgscatalog_utils/match/label.py delete mode 100644 pgscatalog_utils/match/postprocess.py diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py new file mode 100644 index 0000000..4291fb6 --- /dev/null +++ b/pgscatalog_utils/match/label.py @@ -0,0 +1,95 @@ +import logging + +import polars as pl + +from pgscatalog_utils.match.preprocess import complement_valid_alleles + +logger = logging.getLogger(__name__) + + +def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.DataFrame: + """ Label match candidates with additional metadata. Column definitions: + + - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) + - best_match: True if row is the best possible match type (refalt > altref > ...) 
+ - duplicate: True if more than one best match exists for the same accession and ID + - ambiguous: True if ambiguous + """ + return (df.with_column(pl.lit(True).alias('match_candidate')) + .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .pipe(_label_best_match) + .pipe(_label_duplicate_best_match, keep_first_match)) + + +def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: + logger.debug("Labelling ambiguous variants") + ambig = ((df.with_columns([ + pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), + pl.lit(True).alias("ambiguous")]) + .pipe(complement_valid_alleles, ["REF"])) + .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) + .then(pl.col("ambiguous")) + .otherwise(False))) + + if remove_ambiguous: + logger.debug("Labelling ambiguous variants with exclude flag") + return ambig.with_column(pl.when(pl.col('ambiguous') == True) + .then(True) + .otherwise(False) + .alias('exclude')) + else: + return ambig.with_column(pl.lit(False).alias('exclude')) + + +def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Labelling best match type (refalt > altref > ...)") + match_priority = {'refalt': 0, 'altref': 1, 'refalt_flip': 2, 'altref_flip': 3, 'no_oa_ref': 4, 'no_oa_alt': 5, + 'no_oa_ref_flip': 6, 'no_oa_alt_flip': 7} + match_priority_rev = {v: k for k, v in match_priority.items()} + + # use a groupby aggregation to guarantee the number of rows stays the same + # rows were being lost using an anti join + reduce approach + prioritised: pl.DataFrame = (df.with_column(pl.col('match_type') + .apply(lambda x: match_priority[x]) + .alias('match_priority')) + .with_column(pl.col("match_priority") + .min() + .over(["accession", "row_nr"]) + .apply(lambda x: match_priority_rev[x]) + .alias('best_match_type')) + .with_column(pl.when(pl.col('best_match_type') == pl.col('match_type')) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('best_match'))) + assert prioritised.shape[0] == df.shape[0] # I'm watching you, Wazowski. Always watching. Always. 
+ return prioritised.drop(['match_priority', 'best_match_type']) + + +def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match) -> pl.DataFrame: + logger.debug('Labelling duplicated best matches') + duplicates = (df.with_column(pl.col('best_match') + .count() + .over(['accession', 'ID', 'best_match']) + .alias('count')) + .with_column(pl.when(pl.col('count') > 1) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate')) + .drop('count')) + + if keep_first_match: + logger.debug("Keeping first duplicate, labelling others with exclude flag ") + # set first duplicate (with the smallest row_nr) to exclude = false + labelled = duplicates.with_column(pl.when((pl.col("duplicate") == True) & + (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "ID", "duplicate"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate')) + else: + logger.debug("Labelling all duplicates with exclude flag") + labelled = duplicates.with_column(pl.lit(False).alias('exclude_duplicate')) + + # get the horizontal maximum to combine the exclusion columns for each variant + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) + .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 9387146..677f22a 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -2,12 +2,13 @@ import polars as pl -from pgscatalog_utils.match.postprocess import postprocess_matches +from pgscatalog_utils.match.label import label_matches logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool, remove_ambiguous: bool, + keep_first_match: bool) -> pl.DataFrame: scorefile_cat, target_cat = _cast_categorical(scorefile, target) scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) @@ -33,7 +34,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(postprocess_matches) + return pl.concat(matches).pipe(label_matches, remove_ambiguous, keep_first_match) def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 6937a90..b6962e4 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -6,10 +6,11 @@ import polars as pl from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.read import read_target, read_scorefile -from pgscatalog_utils.match.write import write_out +from pgscatalog_utils.match.write import write_out, write_log logger = logging.getLogger(__name__) @@ -20,9 +21,10 @@ def match_variants(): set_logging_level(args.verbose) logger.debug(f"polars n_threads: {pl.threadpool_size()}") - scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) with pl.StringCache(): + 
scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) + n_target_files = len(glob(args.target)) matches: pl.DataFrame @@ -36,27 +38,31 @@ def match_variants(): match match_mode: case "single": logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip) + matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, args.keep_first_match) case "multi": logger.debug(f"Match mode: {match_mode}") - matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, - args.skip_flip) + matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, args.keep_first_match) case "fast": logger.debug(f"Match mode: {match_mode}") - matches = _fast_match(args.target, scorefile, args.remove_multiallelic, - args.skip_flip) + matches = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, args.keep_first_match) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches = filter_scores(scorefile, matches, args.remove_ambiguous, args.keep_first_match, args.min_overlap, - dataset) + valid_matches, filter_summary = filter_scores(scorefile, matches, args.remove_ambiguous, + args.keep_first_match, args.min_overlap, dataset) if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") raise Exception + big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) + + write_log(big_log, args.dataset) write_out(valid_matches, args.split, args.outdir, dataset) @@ -70,37 +76,37 @@ def _check_target_chroms(target) -> None: def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool) -> pl.DataFrame: + skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, skip_filp) + return get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match) def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool) -> pl.DataFrame: + skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, skip_filp)) + matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) return pl.concat(matches) def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool) -> pl.DataFrame: + skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: matches = [] for chrom in 
scorefile['chr_name'].unique().to_list(): target = read_target(target_path, remove_multiallelic=remove_multiallelic, single_file=True, chrom=chrom) # scans and filters if target: logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, skip_filp)) + matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) return pl.concat(matches) diff --git a/pgscatalog_utils/match/postprocess.py b/pgscatalog_utils/match/postprocess.py deleted file mode 100644 index 0ae0b2b..0000000 --- a/pgscatalog_utils/match/postprocess.py +++ /dev/null @@ -1,86 +0,0 @@ -import logging -from functools import reduce - -import polars as pl - -from pgscatalog_utils.match.preprocess import complement_valid_alleles - -logger = logging.getLogger(__name__) - - -def postprocess_matches(df: pl.DataFrame) -> pl.DataFrame: - """ Label match candidates with additional metadata. Column definitions: - - - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) - - best_match: True if row is the best possible match type (refalt > altref > ...) - - duplicate: True if >1 scoring file line matches to the same variant ID - - ambiguous: True if ambiguous - """ - return (df.with_column(pl.lit(True).alias('match_candidate')) - .pipe(_label_biallelic_ambiguous) - .pipe(_label_pruned_matches)) - - -def _label_biallelic_ambiguous(df: pl.DataFrame) -> pl.DataFrame: - logger.debug("Labelling ambiguous variants") - df = df.with_columns([ - pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), - pl.lit(True).alias("ambiguous") - ]).pipe(complement_valid_alleles, ["REF"]) - - return (df.with_column( - pl.when(pl.col("REF_FLIP") == pl.col("ALT")) - .then(pl.col("ambiguous")) - .otherwise(False))) - - -def _label_pruned_matches(df: pl.DataFrame) -> pl.DataFrame: - best_matches = (df.pipe(_label_best_match) - .pipe(_label_duplicates)) - - # check that duplicates were correctly labelled - u_counts = best_matches.filter(pl.col('duplicate') == False).groupby(['accession', 'ID']).count() - assert (u_counts['count'] == 1).all(), \ - "Duplicate effect weights for a variant: {}".format(list(u_counts['accession'].unique())) - - labelled = (df.join(best_matches, how='left', on=['row_nr', 'accession', 'ID']) - .select(pl.exclude("^.*_right$"))) - assert labelled.shape[0] == df.shape[0] # don't want to lose any rows from the input df - - return labelled - - -def _label_duplicates(df: pl.DataFrame) -> pl.DataFrame: - """ Label scorefile (accession) matches with only one ID match (singletons) vs. 
multiple (duplicates)""" - logger.debug('Labelling multiple accession - ID rows as duplicates') - - join_cols = ['accession', 'ID'] - counted = df.groupby(join_cols).count() - singletons = (counted.filter(pl.col('count') == 1)[:, join_cols] - .join(df, on=join_cols, how='left') - .with_column(pl.lit(False).alias('duplicate'))) - duplicates = (counted.filter(pl.col('count') > 1)[:, join_cols] - .join(df, on=join_cols, how='left') - .with_column(pl.lit(True).alias('duplicate'))) - - return pl.concat([singletons, duplicates]) - - -def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: - match_priority = ['refalt', 'altref', 'refalt_flip', 'altref_flip', 'no_oa_ref', 'no_oa_alt', 'no_oa_ref_flip', - 'no_oa_alt_flip'] - match: list[pl.DataFrame] = [] - for match_type in match_priority: - logger.debug(f"Selecting matches with match type {match_type}") - match.append(df.filter(pl.col("match_type") == match_type)) - - logger.debug("Labelling best match type (refalt > altref > ...)") - best_match: pl.DataFrame = reduce(lambda x, y: _prioritise_best_match(x, y), match) - return best_match.with_column(pl.lit(True).alias('best_match')) - - -def _prioritise_best_match(x: pl.DataFrame, y: pl.DataFrame) -> pl.DataFrame: - # variants in dataframe x have a higher priority than dataframe y - # when concatenating the two dataframes, use an anti join to first remove variants in y that are in x - not_in: pl.DataFrame = y.join(x, how='anti', on=['accession', 'ID', 'row_nr']) - return pl.concat([x, not_in]) From 2abfbe12f79b6b87be893b6843ed04a0e73203ba Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:38:57 +0100 Subject: [PATCH 34/59] update filter to use new flags --- pgscatalog_utils/match/filter.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index caa9360..5256a57 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -2,14 +2,12 @@ import polars as pl -from pgscatalog_utils.match.log import write_log - logger = logging.getLogger(__name__) def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool, - min_overlap: float, dataset: str) -> pl.DataFrame: - """ Remove scores that don't match well """ + min_overlap: float, dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: + """ Remove scores that don't match well and return a summary report df""" scorefile: pl.DataFrame = scorefile.with_columns([ pl.col('effect_type').cast(pl.Categorical), pl.col('accession').cast(pl.Categorical)]) # same dtypes for join @@ -17,7 +15,7 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguo # matches may contain more than one row per variant in the scoring file # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) - match_log: pl.DataFrame = _join_matches(filtered_matches, scorefile, dataset) + match_log: pl.DataFrame = _join_filtered_matches(filtered_matches, scorefile, dataset) match_log['best_match'] = match_log['best_match'].fill_null(False) fail_rates: pl.DataFrame = _calculate_match_rate(match_log) @@ -27,17 +25,17 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguo if rate < (1 - min_overlap): df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [True], 'match_rate': [1 - rate]}) 
logger.debug(f"Score {accession} passes minimum matching threshold ({1 - rate:.2%} variants match)") - scores.append(df) + scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) else: df: pl.DataFrame = pl.DataFrame({'accession': [accession], 'score_pass': [False], 'match_rate': [1 - rate]}) logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") - scores.append(df) + scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) - (match_log.with_column(pl.col('accession').cast(str)) - .join(pl.concat(scores), on='accession', how='left')).pipe(write_log, dataset) # write log to gzipped CSV + score_summary: pl.DataFrame = pl.concat(scores) + filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left') + .filter(pl.col('score_pass') == True)) - return (filtered_matches.with_column(pl.col('accession').cast(str)) - .join(pl.concat(scores), on='accession', how='left')) + return filtered_scores, score_summary def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: @@ -70,18 +68,15 @@ def _handle_duplicates(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame singletons = df.filter(pl.col('duplicate') == False) if keep_first_match: logger.debug("Filtering: keeping first match") - first = (df.filter(pl.col('duplicate') == True) - .groupby(["accession", "ID"]) - .agg([pl.col("row_nr").first()]) - .join(df, on=['accession', 'row_nr'], how='left')) - return pl.concat([singletons, first.select(singletons.columns)]) + first = df.filter((pl.col('duplicate') == True) & (pl.col('exclude') == False)) + return pl.concat([singletons, first]) else: logger.debug("Filtering: dropping any duplicate matches") return singletons -def _join_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: - return (scorefile.join(matches, on=['accession', 'row_nr'], how='left') +def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: + return (scorefile.join(matches, on=['row_nr', 'accession'], how='left') .with_column(pl.lit(dataset).alias('dataset')) .select(pl.exclude("^.*_right$"))) From cae522e54ee1a3b179f942aba2395c7e669a670c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:39:12 +0100 Subject: [PATCH 35/59] move write_log to write module --- pgscatalog_utils/match/log.py | 4 ---- pgscatalog_utils/match/write.py | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 13085cb..467800b 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -6,10 +6,6 @@ logger = logging.getLogger(__name__) -def write_log(df: pl.DataFrame, dataset: str) -> None: - logger.debug("Compressing and writing log") - with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: - df.pipe(_prettify_log).write_csv(f) def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 7a8a880..8243c8f 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -1,3 +1,4 @@ +import gzip import logging import os @@ -6,6 +7,12 @@ logger = logging.getLogger(__name__) +def write_log(df: pl.DataFrame, dataset: str) -> None: + logger.debug("Compressing and writing log") + with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: + df.write_csv(f) + + def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: if not 
os.path.isdir(outdir): os.mkdir(outdir) From a08ca6e3589828a0e0596d93839aad4c2330dcf0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:39:27 +0100 Subject: [PATCH 36/59] make a summary log --- pgscatalog_utils/match/log.py | 41 ++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 467800b..53cdb07 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -1,4 +1,3 @@ -import gzip import logging import polars as pl @@ -6,10 +5,42 @@ logger = logging.getLogger(__name__) +def make_logs(scorefile, match_candidates, filter_summary, dataset): + big_log = (_join_match_candidates(scorefile, match_candidates, dataset) + .pipe(_prettify_log)) + summary_log = make_summary_log(big_log, filter_summary) + + return _prettify_log(big_log), summary_log + + +def make_summary_log(df, filter_summary): + """ Make an aggregated table """ + return (df.filter(pl.col('best_match') != False) + .groupby(['dataset', 'accession', 'best_match', 'ambiguous', 'is_multiallelic', 'duplicate', 'exclude']) + .count() + .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) + + +def _prettify_summary(df: pl.DataFrame): + keep_cols = ["dataset", "accession", "score_pass", "ambiguous", "is_multiallelic", "duplicate", "count"] def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: - keep_cols = ["chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", - "accession", "row_nr", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate", "best_match", "dataset", "score_pass", "match_rate"] - return df.select(keep_cols).select(pl.exclude("^.*_right")) + keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", + "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", + "ambiguous", "duplicate", "best_match", "exclude", "dataset"] + pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) + return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) + + +def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, dataset: str) -> pl.DataFrame: + """ + Join match candidates against the original scoring file + + Uses an outer join because mltiple match candidates may exist with different match types + + Multiple match candidates will exist as extra rows in the joined dataframe + """ + return (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))) From 1a07125af087a48121ed3abca83075438dda1745 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 16:39:32 +0100 Subject: [PATCH 37/59] cast accession to categorical when first reading scorefile --- pgscatalog_utils/match/read.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index f8f5b3e..d1824a2 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -48,7 +48,8 @@ def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) - .pipe(complement_valid_alleles, 
flip_cols=['effect_allele', 'other_allele'])) + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) + .with_column(pl.col('accession').cast(pl.Categorical))) return scorefile From a59b31324d0d594a0d8a284dedeed4fe05adf0bc Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:54:30 +0100 Subject: [PATCH 38/59] update polars --- poetry.lock | 13 +++++++------ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index e920a73..d776774 100644 --- a/poetry.lock +++ b/poetry.lock @@ -49,7 +49,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "coverage" -version = "6.4.3" +version = "6.4.4" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -134,19 +134,20 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.13.62" +version = "0.14.9" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] +pandas = ["pyarrow (>=4.0)", "pandas"] +connectorx = ["connectorx"] +numpy = ["numpy (>=1.16.0)"] fsspec = ["fsspec"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] -connectorx = ["connectorx"] -pandas = ["pyarrow (>=4.0)", "pandas"] +pytz = ["pytz"] pyarrow = ["pyarrow (>=4.0)"] -numpy = ["numpy (>=1.16.0)"] [[package]] name = "py" @@ -288,7 +289,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "31cffdaa5cb10864005af569ed7ab3142071abe6934d06789bac6a00ca2ba1ee" +content-hash = "607d2d543f52a4ecc116c0b912c499a83cd1c740244323c81fdfe89ba27a55eb" [metadata.files] atomicwrites = [] diff --git a/pyproject.toml b/pyproject.toml index acfcb36..b9899ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ pandas = "^1.4.3" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" -polars = "^0.13.59" +polars = "^0.14.9" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From d2c3fdc0bb0cd2d2101e9c0c52911b3b877d26fc Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:54:38 +0100 Subject: [PATCH 39/59] don't set columns directly --- pgscatalog_utils/match/filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index 5256a57..b34850f 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -15,8 +15,8 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguo # matches may contain more than one row per variant in the scoring file # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) - match_log: pl.DataFrame = _join_filtered_matches(filtered_matches, scorefile, dataset) - match_log['best_match'] = match_log['best_match'].fill_null(False) + match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) + .with_columns(pl.col('best_match').fill_null(False))) fail_rates: pl.DataFrame = _calculate_match_rate(match_log) From 6e46eb845f50ded5a93d3c303b07cebfc6237bcc Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:54:58 +0100 Subject: [PATCH 40/59] encode match status (matched / unmatched / excluded) --- pgscatalog_utils/match/label.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git 
a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 4291fb6..485040d 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -15,10 +15,21 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ - return (df.with_column(pl.lit(True).alias('match_candidate')) - .pipe(_label_biallelic_ambiguous, remove_ambiguous) - .pipe(_label_best_match) - .pipe(_label_duplicate_best_match, keep_first_match)) + labelled = (df.with_column(pl.lit(True).alias('match_candidate')) + .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .pipe(_label_best_match) + .pipe(_label_duplicate_best_match, keep_first_match)) + + # encode a new column called match status containing matched, unmatched, and excluded + return (labelled.with_columns([ + # set false best match to excluded + pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 2}[x]).alias('match_priority'), + pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') + ]) + .with_column(pl.max(["match_priority", "excluded_match_priority"])) + .with_column(pl.col("max") + .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded'}[x]) + .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: @@ -26,7 +37,7 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra ambig = ((df.with_columns([ pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), pl.lit(True).alias("ambiguous")]) - .pipe(complement_valid_alleles, ["REF"])) + .pipe(complement_valid_alleles, ["REF"])) .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) .then(pl.col("ambiguous")) .otherwise(False))) From b7db4a2e0061d55e7a857006d58192906dcf8f3a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 17:55:10 +0100 Subject: [PATCH 41/59] make nice logs --- pgscatalog_utils/match/log.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 53cdb07..dcff82c 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -10,25 +10,30 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): .pipe(_prettify_log)) summary_log = make_summary_log(big_log, filter_summary) - return _prettify_log(big_log), summary_log + return _prettify_log(big_log), _prettify_summary(summary_log) def make_summary_log(df, filter_summary): """ Make an aggregated table """ - return (df.filter(pl.col('best_match') != False) - .groupby(['dataset', 'accession', 'best_match', 'ambiguous', 'is_multiallelic', 'duplicate', 'exclude']) + return (df.groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) .count() - .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) + .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], + reverse=True) def _prettify_summary(df: pl.DataFrame): - keep_cols = ["dataset", "accession", "score_pass", "ambiguous", "is_multiallelic", "duplicate", "count"] + keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate", + "count", 
"percent"] + return (df.with_column((pl.col("count") / pl.sum("count")) + .over(["dataset", "accession"]) + .alias("percent")) + .select(keep_cols)) def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate", "best_match", "exclude", "dataset"] + "ambiguous", "duplicate", "match_status", "dataset"] pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) @@ -42,5 +47,6 @@ def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, datas Multiple match candidates will exist as extra rows in the joined dataframe """ return (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') - .with_column(pl.lit(dataset).alias('dataset')) - .select(pl.exclude("^.*_right$"))) + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) + From 4d0c239466ccd2767b0604ddec45cd2c1e24e843 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 7 Sep 2022 18:14:08 +0100 Subject: [PATCH 42/59] remove not best from match log --- pgscatalog_utils/match/label.py | 4 ++-- pgscatalog_utils/match/log.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 485040d..b562594 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -23,12 +23,12 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da # encode a new column called match status containing matched, unmatched, and excluded return (labelled.with_columns([ # set false best match to excluded - pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 2}[x]).alias('match_priority'), + pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 3}[x]).alias('match_priority'), pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') ]) .with_column(pl.max(["match_priority", "excluded_match_priority"])) .with_column(pl.col("max") - .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded'}[x]) + .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded', 3: 'not_best'}[x]) .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index dcff82c..d8af47a 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -15,7 +15,8 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): def make_summary_log(df, filter_summary): """ Make an aggregated table """ - return (df.groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) + return (df.filter(pl.col('match_status') != 'not_best') + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) .count() .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) From 1100a4d46019c47587e6de5fabdd29c629f4edc8 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:53:25 +0100 Subject: [PATCH 43/59] filter with exclude flag --- pgscatalog_utils/match/filter.py | 45 +++++--------------------------- 1 file changed, 7 insertions(+), 38 
deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index b34850f..c47a449 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -5,16 +5,10 @@ logger = logging.getLogger(__name__) -def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool, - min_overlap: float, dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: - """ Remove scores that don't match well and return a summary report df""" - scorefile: pl.DataFrame = scorefile.with_columns([ - pl.col('effect_type').cast(pl.Categorical), - pl.col('accession').cast(pl.Categorical)]) # same dtypes for join - - # matches may contain more than one row per variant in the scoring file - # e.g., one ambiguous match and one clear match, or duplicates may be in the scoring file - filtered_matches: pl.DataFrame = _filter_matches(matches, remove_ambiguous, keep_first_match) +def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, + dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: + """ Check overlap between filtered matches and scorefile, remove scores that don't match well and report stats """ + filtered_matches: pl.DataFrame = _filter_matches(matches) match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) .with_columns(pl.col('best_match').fill_null(False))) @@ -45,34 +39,9 @@ def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate'))) -def _filter_matches(df: pl.DataFrame, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - logger.debug("Final match candidate filtering") - return (df.filter(pl.col('best_match') == True) - .pipe(_handle_ambiguous, remove_ambiguous) - .pipe(_handle_duplicates, keep_first_match)) - - -def _handle_ambiguous(df: pl.DataFrame, remove_ambiguous: bool) -> pl.DataFrame: - if remove_ambiguous: - logger.debug("Filtering: Removing ambiguous matches") - return df.filter(pl.col("ambiguous") == False) - else: - logger.debug("Filtering: Keeping best possible match from ambiguous matches") - ambiguous: pl.DataFrame = df.filter((pl.col("ambiguous") == True) & \ - (pl.col("match_type").str.contains('flip').is_not())) - unambiguous: pl.DataFrame = df.filter(pl.col("ambiguous") == False) - return pl.concat([ambiguous, unambiguous]) - - -def _handle_duplicates(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: - singletons = df.filter(pl.col('duplicate') == False) - if keep_first_match: - logger.debug("Filtering: keeping first match") - first = df.filter((pl.col('duplicate') == True) & (pl.col('exclude') == False)) - return pl.concat([singletons, first]) - else: - logger.debug("Filtering: dropping any duplicate matches") - return singletons +def _filter_matches(df: pl.DataFrame) -> pl.DataFrame: + logger.debug("Filtering variants with exclude flag") + return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False)) def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: From 157374d4d9fdfca74ae199f45d9d75588b04f7ca Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:53:46 +0100 Subject: [PATCH 44/59] add new check for best matches with duplicate row numbers --- pgscatalog_utils/match/label.py | 57 +++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 
b562594..1636cb1 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -18,7 +18,8 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da labelled = (df.with_column(pl.lit(True).alias('match_candidate')) .pipe(_label_biallelic_ambiguous, remove_ambiguous) .pipe(_label_best_match) - .pipe(_label_duplicate_best_match, keep_first_match)) + .pipe(_label_duplicate_best_match, keep_first_match) + .pipe(_label_duplicate_row_nr)) # encode a new column called match status containing matched, unmatched, and excluded return (labelled.with_columns([ @@ -76,7 +77,21 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: return prioritised.drop(['match_priority', 'best_match_type']) -def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match) -> pl.DataFrame: +def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: + """ Label best match duplicates made when the scoring file is remapped to a different genome build + + ┌─────────┬────────────────────────┬─────────────┬────────────────┬─────┬────────────┐ + │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ + ╞═════════╪════════════════════════╪═════════════╪════════════════╪═════╪════════════╡ + │ 1194115 ┆ PGS002244_hmPOS_GRCh37 ┆ altref ┆ 3:50924580:C:A ┆ C ┆ true │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 1194132 ┆ PGS002244_hmPOS_GRCh37 ┆ refalt_flip ┆ 3:50924580:C:A ┆ C ┆ true │ + └─────────┴────────────────────────┴─────────────┴────────────────┴─────┴────────────┘ + + refalt > altref > ... prioritisation doesn't fix this problem because row_nr is different (duplicated by remapping) + """ logger.debug('Labelling duplicated best matches') duplicates = (df.with_column(pl.col('best_match') .count() @@ -104,3 +119,41 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match) -> pl.DataFr # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) + + +def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: + """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: + + ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ + │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ + ╞════════╪════════════════════════╪════════════╪════════════════╪═════╪════════════╡ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:A:G ┆ A ┆ true │ + ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ + └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + """ + logger.debug("Labelling duplicated matches with same row_nr") + labelled: pl.DataFrame = (df.with_column(pl.col('best_match') + .count() + .over(['accession', 'row_nr', 'best_match']) + .alias('count')) + .with_column(pl.when(pl.col('count') > 1) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate')) + .drop('count') + .rename({'row_nr': 'score_row_nr'}) + .with_row_count() # add temporary row count to get first variant + 
.with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "score_row_nr"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate')) + .drop('row_nr') + .rename({'score_row_nr': 'row_nr'})) + + # get the horizontal maximum to combine the exclusion columns for each variant + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) + .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) From f583bb74ae6a67ef51d090407af519f192a594ff Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:54:23 +0100 Subject: [PATCH 45/59] add assert to log, comparing against input scoring file --- pgscatalog_utils/match/log.py | 57 +++++++++++++++++------- pgscatalog_utils/match/match_variants.py | 7 +-- pgscatalog_utils/match/write.py | 6 +-- 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index d8af47a..1882db2 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -6,16 +6,30 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): - big_log = (_join_match_candidates(scorefile, match_candidates, dataset) - .pipe(_prettify_log)) - summary_log = make_summary_log(big_log, filter_summary) + # best log -> aggregated into summary_log, one match per scoring file line + # big log -> written to compressed gzip, possibly multiple matches per scoring file line + summary_log, big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, + filter_summary=filter_summary, + dataset=dataset) + + # make sure the aggregated best log matches the scoring file accession line count + log_count = (scorefile.groupby("accession") + .count() + .join(summary_log + .groupby(pl.col("accession")) + .agg(pl.sum("count")), + on='accession')) + + assert (log_count['count'] == log_count['count_right']).all(), "Log doesn't match input scoring file" + logger.debug("Log matches input scoring file") return _prettify_log(big_log), _prettify_summary(summary_log) -def make_summary_log(df, filter_summary): +def make_summary_log(best_matches, filter_summary): """ Make an aggregated table """ - return (df.filter(pl.col('match_status') != 'not_best') + logger.debug("Aggregating best match log into a summary table") + return (best_matches .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) .count() .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], @@ -39,15 +53,24 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) -def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, dataset: str) -> pl.DataFrame: - """ - Join match candidates against the original scoring file - - Uses an outer join because mltiple match candidates may exist with different match types - - Multiple match candidates will exist as extra rows in the joined dataframe - """ - return (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') - .with_column(pl.lit(dataset).alias('dataset')) - .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) - +def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filter_summary: pl.DataFrame, + dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: + """ Join match candidates against the original scoring file """ + logger.debug("Making big logs") + # make 
the summary log using the best matched candidates only + summary_log = (scorefile.join(matches.filter(pl.col('match_status') == 'matched'), + on=['row_nr', 'accession'], + how='outer') # left join would make checking line count later pointless + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$")) + .with_column(pl.col('match_status').fill_null("unmatched")) + .pipe(make_summary_log, filter_summary)) + + # make a raw log with all match candidates included + raw_log = (scorefile.join(matches, + on=['row_nr', 'accession'], + how='outer') + .with_column(pl.lit(dataset).alias('dataset')) + .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) + + return summary_log, raw_log diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index b6962e4..336d781 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -53,8 +53,8 @@ def match_variants(): raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches, filter_summary = filter_scores(scorefile, matches, args.remove_ambiguous, - args.keep_first_match, args.min_overlap, dataset) + valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, + min_overlap=args.min_overlap) if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") @@ -62,7 +62,8 @@ def match_variants(): big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) - write_log(big_log, args.dataset) + write_log(big_log, prefix=dataset) + summary_log.write_csv(f"{dataset}_summary.csv") write_out(valid_matches, args.split, args.outdir, dataset) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 8243c8f..32be0cf 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -7,9 +7,9 @@ logger = logging.getLogger(__name__) -def write_log(df: pl.DataFrame, dataset: str) -> None: - logger.debug("Compressing and writing log") - with gzip.open(f"{dataset}_log.csv.gz", 'wb') as f: +def write_log(df: pl.DataFrame, prefix: str) -> None: + logger.debug(f"Compressing and writing log: {prefix}_log.csv.gz") + with gzip.open(f"{prefix}_log.csv.gz", 'wb') as f: df.write_csv(f) From 70b16779cbb11f37b2617417428dea47a2759b53 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:54:44 +0100 Subject: [PATCH 46/59] distinct -> unique (deprecated) --- pgscatalog_utils/match/write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 32be0cf..d7a0378 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -92,7 +92,7 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra # 2. use cumcount to number duplicate IDs # 3. 
join cumcount data on original DF, use this data for splitting ea_count: pl.DataFrame = (df.select(["ID", "effect_allele"]) - .distinct() + .unique() .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") From f232999a86aafd3945cb807a5288bce902ce28c4 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 12:54:57 +0100 Subject: [PATCH 47/59] cast more columns to categorical --- pgscatalog_utils/match/read.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index d1824a2..fd1a4c3 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -49,7 +49,10 @@ def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) - .with_column(pl.col('accession').cast(pl.Categorical))) + .with_columns([ + pl.col('accession').cast(pl.Categorical), + pl.col("effect_type").cast(pl.Categorical)])) + return scorefile From 02d43989243bddd7c1a7af5d3105416e5d220843 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 16:20:15 +0100 Subject: [PATCH 48/59] use best_match for summary log --- pgscatalog_utils/match/label.py | 34 +++++++++++++++++++++------------ pgscatalog_utils/match/log.py | 3 ++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 1636cb1..5fa8f20 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -15,11 +15,12 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ - labelled = (df.with_column(pl.lit(True).alias('match_candidate')) - .pipe(_label_biallelic_ambiguous, remove_ambiguous) + labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match, keep_first_match) - .pipe(_label_duplicate_row_nr)) + .pipe(_label_duplicate_row_nr) + .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .with_column(pl.lit(True).alias('match_candidate'))) # encode a new column called match status containing matched, unmatched, and excluded return (labelled.with_columns([ @@ -45,15 +46,22 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra if remove_ambiguous: logger.debug("Labelling ambiguous variants with exclude flag") - return ambig.with_column(pl.when(pl.col('ambiguous') == True) - .then(True) - .otherwise(False) - .alias('exclude')) + return (ambig.with_column(pl.when(pl.col('ambiguous') == True) + .then(True) + .otherwise(False) + .alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "exclude_ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) else: - return ambig.with_column(pl.lit(False).alias('exclude')) + return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: + """ Best matches have the lowest match priority type. Find the best matches and label them. 
""" logger.debug("Labelling best match type (refalt > altref > ...)") match_priority = {'refalt': 0, 'altref': 1, 'refalt_flip': 2, 'altref_flip': 3, 'no_oa_ref': 4, 'no_oa_alt': 5, 'no_oa_ref_flip': 6, 'no_oa_alt_flip': 7} @@ -118,7 +126,8 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl. # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) - .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) + .drop(["exclude", "exclude_duplicate"]) + .rename({"max": "exclude"})) def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: @@ -150,10 +159,11 @@ def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: .over(["accession", "score_row_nr"]))) .then(True) .otherwise(False) - .alias('exclude_duplicate')) + .alias('exclude_duplicate_row_nr')) .drop('row_nr') .rename({'score_row_nr': 'row_nr'})) # get the horizontal maximum to combine the exclusion columns for each variant - return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) - .drop(["exclude", "exclude_duplicate"])).rename({"max": "exclude"}) + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_row_nr"])) + .drop(["exclude", "exclude_duplicate_row_nr"]) + .rename({"max": "exclude"})) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 1882db2..b0a4bf8 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -57,8 +57,9 @@ def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filte dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: """ Join match candidates against the original scoring file """ logger.debug("Making big logs") + # make the summary log using the best matched candidates only - summary_log = (scorefile.join(matches.filter(pl.col('match_status') == 'matched'), + summary_log = (scorefile.join(matches.filter(pl.col('best_match') == True), on=['row_nr', 'accession'], how='outer') # left join would make checking line count later pointless .with_column(pl.lit(dataset).alias('dataset')) From 7ad843fa975553412ee0cfe03cef43412dd8ca0f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 8 Sep 2022 17:04:43 +0100 Subject: [PATCH 49/59] reset best_match flag --- pgscatalog_utils/match/label.py | 43 ++++++++++++++++++++------------- pgscatalog_utils/match/log.py | 11 ++++----- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 5fa8f20..b486a11 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -142,26 +142,35 @@ def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + + Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false """ logger.debug("Labelling duplicated matches with same row_nr") labelled: pl.DataFrame = (df.with_column(pl.col('best_match') - .count() - .over(['accession', 'row_nr', 'best_match']) - .alias('count')) - .with_column(pl.when(pl.col('count') > 1) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias('duplicate')) - .drop('count') - .rename({'row_nr': 'score_row_nr'}) - .with_row_count() # add temporary row count to get first variant - 
.with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "score_row_nr"]))) - .then(True) - .otherwise(False) - .alias('exclude_duplicate_row_nr')) - .drop('row_nr') - .rename({'score_row_nr': 'row_nr'})) + .count() + .over(['accession', 'row_nr', 'best_match']) + .alias('count')) + .with_column(pl.when(pl.col('count') > 1) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate')) + .drop('count') + .rename({'row_nr': 'score_row_nr'}) + .with_row_count() # add temporary row count to get first variant + .with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "score_row_nr"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate_row_nr')) + .with_column(pl.when((pl.col("best_match") == True) & + (pl.col("duplicate") == True) & + (pl.col("row_nr") > pl.min("row_nr")).over( + ["accession", "score_row_nr"])) + .then(False) # reset best match flag for duplicates + .otherwise(pl.col("best_match")) # just keep value from existing column + .alias('best_match_duplicate_row_nr')) + .drop(['row_nr', 'best_match']) + .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_row_nr"])) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index b0a4bf8..9bbad27 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -6,19 +6,18 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): - # best log -> aggregated into summary_log, one match per scoring file line - # big log -> written to compressed gzip, possibly multiple matches per scoring file line + # summary log -> aggregated from best matches (one per scoring file line) + # big log -> unaggregated, written to compressed gzip, possibly multiple matches per scoring file line summary_log, big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, filter_summary=filter_summary, dataset=dataset) # make sure the aggregated best log matches the scoring file accession line count + summary_count = (summary_log.groupby(pl.col('accession')) + .agg(pl.sum('count'))) log_count = (scorefile.groupby("accession") .count() - .join(summary_log - .groupby(pl.col("accession")) - .agg(pl.sum("count")), - on='accession')) + .join(summary_count, on='accession')) assert (log_count['count'] == log_count['count_right']).all(), "Log doesn't match input scoring file" logger.debug("Log matches input scoring file") From 8af2ac1ade81d38d14af6a6c3618ff9cc2752e49 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 9 Sep 2022 15:53:01 +0100 Subject: [PATCH 50/59] make labelling clearer --- pgscatalog_utils/match/label.py | 173 ++++++++++++++++---------------- pgscatalog_utils/match/log.py | 9 +- 2 files changed, 94 insertions(+), 88 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index b486a11..fffd77f 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -17,14 +17,14 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da """ labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) - .pipe(_label_duplicate_best_match, keep_first_match) - .pipe(_label_duplicate_row_nr) + .pipe(_label_duplicate_best_match) + 
.pipe(_label_duplicate_id, keep_first_match) .pipe(_label_biallelic_ambiguous, remove_ambiguous) .with_column(pl.lit(True).alias('match_candidate'))) - # encode a new column called match status containing matched, unmatched, and excluded + # encode a new column called match status containing matched, unmatched, excluded, and not_best return (labelled.with_columns([ - # set false best match to excluded + # set false best match to not_best pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 3}[x]).alias('match_priority'), pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') ]) @@ -34,32 +34,6 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) -def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: - logger.debug("Labelling ambiguous variants") - ambig = ((df.with_columns([ - pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), - pl.lit(True).alias("ambiguous")]) - .pipe(complement_valid_alleles, ["REF"])) - .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) - .then(pl.col("ambiguous")) - .otherwise(False))) - - if remove_ambiguous: - logger.debug("Labelling ambiguous variants with exclude flag") - return (ambig.with_column(pl.when(pl.col('ambiguous') == True) - .then(True) - .otherwise(False) - .alias('exclude_ambiguous')) - .with_column(pl.max(["exclude", "exclude_ambiguous"])) - .drop(["exclude", "exclude_ambiguous"]) - .rename({"max": "exclude"})) - else: - return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) - .with_column(pl.max(["exclude", "ambiguous"])) - .drop(["exclude", "exclude_ambiguous"]) - .rename({"max": "exclude"})) - - def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: """ Best matches have the lowest match priority type. Find the best matches and label them. 
""" logger.debug("Labelling best match type (refalt > altref > ...)") @@ -85,7 +59,56 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: return prioritised.drop(['match_priority', 'best_match_type']) -def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: +def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: + """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: + + ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ + │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ + ╞════════╪════════════════════════╪════════════╪════════════════╪═════╪════════════╡ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:A:G ┆ A ┆ true │ + ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ + └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + + Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false + """ + logger.debug("Labelling duplicated best match: flagging first instance with exclude = false") + labelled: pl.DataFrame = (df.with_column(pl.col('best_match') + .count() + .over(['accession', 'row_nr', 'best_match']) + .alias('count')) + .with_column(pl.when((pl.col('count') > 1) & (pl.col('best_match') == True)) + .then(pl.lit(True)) + .otherwise(pl.lit(False)) + .alias('duplicate_best_match')) + .drop('count') + .rename({'row_nr': 'score_row_nr'}) + .with_row_count() # add temporary row count to get first variant + .with_column(pl.when((pl.col("duplicate_best_match") == True) & + (pl.col("row_nr") != pl.min("row_nr") + .over(["accession", "score_row_nr"]))) + .then(True) + .otherwise(False) + .alias('exclude_duplicate_best_match')) + .with_column(pl.when((pl.col("best_match") == True) & + (pl.col("duplicate_best_match") == True) & + (pl.col("row_nr") > pl.min("row_nr")).over( + ["accession", "score_row_nr"])) + .then(False) # reset best match flag for duplicates + .otherwise(pl.col("best_match")) # just keep value from existing column + .alias('best_match_duplicate_row_nr')) + .drop(['row_nr', 'best_match']) + .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) + + # get the horizontal maximum to combine the exclusion columns for each variant + return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_best_match"])) + .drop(["exclude", "exclude_duplicate_best_match"]) + .rename({"max": "exclude"})) + + +def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: """ Label best match duplicates made when the scoring file is remapped to a different genome build ┌─────────┬────────────────────────┬─────────────┬────────────────┬─────┬────────────┐ @@ -100,29 +123,31 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl. refalt > altref > ... 
prioritisation doesn't fix this problem because row_nr is different (duplicated by remapping) """ - logger.debug('Labelling duplicated best matches') - duplicates = (df.with_column(pl.col('best_match') - .count() - .over(['accession', 'ID', 'best_match']) + logger.debug('Labelling multiple scoring file lines (accession/row_nr) that best_match to the same variant') + + # the window in .over() starts with accession + ID + # best_match is added to not count: same row_nr, different match_type (_label_best_match) + # duplicate_best_match is added to not count: same row_nr, same match_type (_label_duplicate_best_match) + duplicates = (df.with_column(pl.count("ID") + .over(['accession', 'ID', 'best_match', 'duplicate_best_match']) .alias('count')) - .with_column(pl.when(pl.col('count') > 1) + .with_column(pl.when((pl.col('count') > 1) & (pl.col('best_match') == True)) .then(pl.lit(True)) .otherwise(pl.lit(False)) - .alias('duplicate')) - .drop('count')) + .alias('duplicate_ID'))) if keep_first_match: logger.debug("Keeping first duplicate, labelling others with exclude flag ") # set first duplicate (with the smallest row_nr) to exclude = false - labelled = duplicates.with_column(pl.when((pl.col("duplicate") == True) & + labelled = duplicates.with_column(pl.when((pl.col("duplicate_ID") == True) & (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "ID", "duplicate"]))) + .over(["accession", "ID", "duplicate_ID"]))) .then(True) .otherwise(False) .alias('exclude_duplicate')) else: logger.debug("Labelling all duplicates with exclude flag") - labelled = duplicates.with_column(pl.lit(False).alias('exclude_duplicate')) + labelled = duplicates.with_column(pl.col('duplicate_ID').alias('exclude_duplicate')) # get the horizontal maximum to combine the exclusion columns for each variant return (labelled.with_column(pl.max(["exclude", "exclude_duplicate"])) @@ -130,49 +155,29 @@ def _label_duplicate_best_match(df: pl.DataFrame, keep_first_match: bool) -> pl. 
.rename({"max": "exclude"})) -def _label_duplicate_row_nr(df: pl.DataFrame) -> pl.DataFrame: - """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: +def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: + logger.debug("Labelling ambiguous variants") + ambig = ((df.with_columns([ + pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), + pl.lit(True).alias("ambiguous")]) + .pipe(complement_valid_alleles, ["REF"])) + .with_column(pl.when(pl.col("REF_FLIP") == pl.col("ALT")) + .then(pl.col("ambiguous")) + .otherwise(False))) - ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ - │ row_nr ┆ accession ┆ match_type ┆ ID ┆ REF ┆ best_match │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ cat ┆ str ┆ cat ┆ str ┆ bool │ - ╞════════╪════════════════════════╪════════════╪════════════════╪═════╪════════════╡ - │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:A:G ┆ A ┆ true │ - ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ - └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ + if remove_ambiguous: + logger.debug("Labelling ambiguous variants with exclude flag") + return (ambig.with_column(pl.when(pl.col('ambiguous') == True) + .then(True) + .otherwise(False) + .alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "exclude_ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) + else: + return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) + .with_column(pl.max(["exclude", "ambiguous"])) + .drop(["exclude", "exclude_ambiguous"]) + .rename({"max": "exclude"})) - Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false - """ - logger.debug("Labelling duplicated matches with same row_nr") - labelled: pl.DataFrame = (df.with_column(pl.col('best_match') - .count() - .over(['accession', 'row_nr', 'best_match']) - .alias('count')) - .with_column(pl.when(pl.col('count') > 1) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias('duplicate')) - .drop('count') - .rename({'row_nr': 'score_row_nr'}) - .with_row_count() # add temporary row count to get first variant - .with_column(pl.when((pl.col("duplicate") == True) & (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "score_row_nr"]))) - .then(True) - .otherwise(False) - .alias('exclude_duplicate_row_nr')) - .with_column(pl.when((pl.col("best_match") == True) & - (pl.col("duplicate") == True) & - (pl.col("row_nr") > pl.min("row_nr")).over( - ["accession", "score_row_nr"])) - .then(False) # reset best match flag for duplicates - .otherwise(pl.col("best_match")) # just keep value from existing column - .alias('best_match_duplicate_row_nr')) - .drop(['row_nr', 'best_match']) - .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) - # get the horizontal maximum to combine the exclusion columns for each variant - return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_row_nr"])) - .drop(["exclude", "exclude_duplicate_row_nr"]) - .rename({"max": "exclude"})) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 9bbad27..b58bb55 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -29,15 +29,16 @@ def 
make_summary_log(best_matches, filter_summary): """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches - .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate']) + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', + 'duplicate_ID']) .count() .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], reverse=True) def _prettify_summary(df: pl.DataFrame): - keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate", - "count", "percent"] + keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", + "duplicate_best_match", "duplicate_ID", "count", "percent"] return (df.with_column((pl.col("count") / pl.sum("count")) .over(["dataset", "accession"]) .alias("percent")) @@ -47,7 +48,7 @@ def _prettify_summary(df: pl.DataFrame): def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate", "match_status", "dataset"] + "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) From c3e98a4b7b9d5aac9b387b81f3a5d0689834bdee Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 11:20:48 +0100 Subject: [PATCH 51/59] don't exclude duplicate best matches, just reset the best_match flag --- pgscatalog_utils/match/label.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index fffd77f..0dbd71f 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -72,9 +72,9 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: │ 38557 ┆ PGS000012_hmPOS_GRCh37 ┆ no_oa_alt ┆ 3:29588979:T:G ┆ T ┆ true │ └────────┴────────────────────────┴────────────┴────────────────┴─────┴────────────┘ - Label the first row with exclude = false, and duplicate rows with exclude = true and best_match = false + Label the first row with best_match = true, and duplicate rows with best_match = false """ - logger.debug("Labelling duplicated best match: flagging first instance with exclude = false") + logger.debug("Labelling duplicated best match: keeping first instance as best_match = True") labelled: pl.DataFrame = (df.with_column(pl.col('best_match') .count() .over(['accession', 'row_nr', 'best_match']) @@ -86,12 +86,6 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: .drop('count') .rename({'row_nr': 'score_row_nr'}) .with_row_count() # add temporary row count to get first variant - .with_column(pl.when((pl.col("duplicate_best_match") == True) & - (pl.col("row_nr") != pl.min("row_nr") - .over(["accession", "score_row_nr"]))) - .then(True) - .otherwise(False) - .alias('exclude_duplicate_best_match')) .with_column(pl.when((pl.col("best_match") == True) & (pl.col("duplicate_best_match") == True) & (pl.col("row_nr") > pl.min("row_nr")).over( @@ -102,10 +96,7 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: .drop(['row_nr', 'best_match']) .rename({'score_row_nr': 'row_nr', 
'best_match_duplicate_row_nr': 'best_match'})) - # get the horizontal maximum to combine the exclusion columns for each variant - return (labelled.with_column(pl.max(["exclude", "exclude_duplicate_best_match"])) - .drop(["exclude", "exclude_duplicate_best_match"]) - .rename({"max": "exclude"})) + return labelled def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: From a92383a207aca3cb2778fa1cce2a1ba27d1856d5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 13:52:53 +0100 Subject: [PATCH 52/59] fix percent --- pgscatalog_utils/match/log.py | 7 +++---- tests/match/__init__.py | 0 tests/match/test_label.py | 0 tests/{ => match}/test_match.py | 0 4 files changed, 3 insertions(+), 4 deletions(-) create mode 100644 tests/match/__init__.py create mode 100644 tests/match/test_label.py rename tests/{ => match}/test_match.py (100%) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index b58bb55..b214aaa 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -32,14 +32,13 @@ def make_summary_log(best_matches, filter_summary): .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', 'duplicate_ID']) .count() - .join(filter_summary, how='left', on='accession')).sort(['dataset', 'accession', 'score_pass'], - reverse=True) + .join(filter_summary, how='left', on='accession')) def _prettify_summary(df: pl.DataFrame): keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate_best_match", "duplicate_ID", "count", "percent"] - return (df.with_column((pl.col("count") / pl.sum("count")) + return (df.with_column((pl.col("count") / pl.sum("count") * 100) .over(["dataset", "accession"]) .alias("percent")) .select(keep_cols)) @@ -50,7 +49,7 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) - return pretty_df.sort(["accession", "row_nr", "chr_name", "chr_position"]) + return pretty_df def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filter_summary: pl.DataFrame, diff --git a/tests/match/__init__.py b/tests/match/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/match/test_label.py b/tests/match/test_label.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_match.py b/tests/match/test_match.py similarity index 100% rename from tests/test_match.py rename to tests/match/test_match.py From e530f1547051e233c1d2f51bda1c6237c6a651a0 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 13:53:15 +0100 Subject: [PATCH 53/59] add _encode_match_priority function --- pgscatalog_utils/match/label.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 0dbd71f..e3d059d 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -22,8 +22,12 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .pipe(_label_biallelic_ambiguous, remove_ambiguous) .with_column(pl.lit(True).alias('match_candidate'))) - # encode a new column called match status containing matched, unmatched, excluded, and not_best - return (labelled.with_columns([ + return 
_encode_match_priority(labelled) + + +def _encode_match_priority(df: pl.DataFrame) -> pl.DataFrame: + """ Encode a new column called match status containing matched, unmatched, excluded, and not_best """ + return (df.with_columns([ # set false best match to not_best pl.col('best_match').apply(lambda x: {None: 0, True: 1, False: 3}[x]).alias('match_priority'), pl.col('exclude').apply(lambda x: {None: 0, True: 2, False: 0}[x]).alias('excluded_match_priority') @@ -31,7 +35,8 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .with_column(pl.max(["match_priority", "excluded_match_priority"])) .with_column(pl.col("max") .apply(lambda x: {0: 'unmatched', 1: 'matched', 2: 'excluded', 3: 'not_best'}[x]) - .alias('match_status'))).drop(["max", "excluded_match_priority", "match_priority"]) + .alias('match_status') + .cast(pl.Categorical)).drop(["max", "excluded_match_priority", "match_priority"])) def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: From c5affc668ff008ae31562a1466462a924a92cd2f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 16:30:09 +0100 Subject: [PATCH 54/59] fix labels when keeping ambiguous variants --- pgscatalog_utils/match/label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index e3d059d..0d38ccb 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -172,7 +172,7 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra .rename({"max": "exclude"})) else: return (ambig.with_column(pl.lit(False).alias('exclude_ambiguous')) - .with_column(pl.max(["exclude", "ambiguous"])) + .with_column(pl.max(["exclude", "exclude_ambiguous"])) .drop(["exclude", "exclude_ambiguous"]) .rename({"max": "exclude"})) From 800bf33f8da84c39ec552488a0fbc93851c98df1 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 16:30:25 +0100 Subject: [PATCH 55/59] sort big log --- pgscatalog_utils/match/log.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index b214aaa..91f3999 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -48,7 +48,9 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] - pretty_df = (df.select(keep_cols).select(pl.exclude("^.*_right"))) + pretty_df = (df.select(keep_cols) + .select(pl.exclude("^.*_right")) + .sort(["accession", "row_nr", "chr_name", "chr_position"])) return pretty_df From 1a4a14ef2e2772a080aa353f6a36236927b134e7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 16:30:36 +0100 Subject: [PATCH 56/59] add label tests --- conftest.py | 40 ++++++++++++ tests/match/test_label.py | 125 ++++++++++++++++++++++++++++++++++++++ tests/match/test_match.py | 54 ++++------------ 3 files changed, 177 insertions(+), 42 deletions(-) diff --git a/conftest.py b/conftest.py index e322b96..5027d61 100644 --- a/conftest.py +++ b/conftest.py @@ -3,11 +3,13 @@ from unittest.mock import patch import pandas as pd +import polars as pl import pytest import requests as req from pysqlar import SQLiteArchive from pgscatalog_utils.download.download_scorefile 
import download_scorefile +from pgscatalog_utils.match.preprocess import complement_valid_alleles from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles @@ -141,6 +143,44 @@ def hg19_coords(hg38_coords): return pd.DataFrame(d) +@pytest.fixture +def small_flipped_scorefile(small_scorefile): + # simulate a scorefile on the wrong strand + return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) + .drop(['effect_allele', 'other_allele']) + .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) + .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) + + +@pytest.fixture +def small_target(): + return pl.DataFrame({"#CHROM": [1, 2, 3], + "POS": [1, 2, 3], + "REF": ["A", "T", "T"], + "ALT": ["C", "A", "G"], + "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], + "is_multiallelic": [False, False, False]}) + + +@pytest.fixture +def small_scorefile(): + df = pl.DataFrame({"accession": ["test", "test", "test"], + "row_nr": [1, 2, 3], + "chr_name": [1, 2, 3], + "chr_position": [1, 2, 3], + "effect_allele": ["A", "A", "G"], + "other_allele": ["C", "T", "T"], + "effect_weight": [1, 2, 3], + "effect_type": ["additive", "additive", "additive"]}) + + return complement_valid_alleles(df, ["effect_allele", "other_allele"]) + + +@pytest.fixture +def small_scorefile_no_oa(small_scorefile): + return small_scorefile.with_column(pl.lit(None).alias('other_allele')) + + def _get_timeout(url): try: return req.get(url, timeout=5) diff --git a/tests/match/test_label.py b/tests/match/test_label.py index e69de29..8198335 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -0,0 +1,125 @@ +""" Test that matches have the correct labels, which is important for edge case handling and summary stats """ + +import logging +import pytest +import polars as pl + +from pgscatalog_utils.match.match import get_all_matches +from tests.match.test_match import _cast_cat + +logger = logging.getLogger(__name__) + + +def test_label_best_match(multiple_match_types): + """ Test that multiple match candidates are correctly prioritised """ + # both matches are flagged as candidates + assert multiple_match_types['match_candidate'].to_list() == [True, True] + # but the matches have different match types + assert multiple_match_types['match_type'].to_list() == ["altref", "refalt_flip"] + # only one match candidate can survive! + assert multiple_match_types['best_match'].to_list() == [True, False] + assert multiple_match_types['match_status'].to_list() == ["matched", "not_best"] + # however, exclude is _only_ for omitting a 'best match' from the final results, e.g. 
because of duplication + assert multiple_match_types['exclude'].to_list() == [False, False] + # match candidates are filtered by best_match == True and exclude == False + + +def test_label(small_scorefile, small_target): + """ Test typical labels for match candidates with one match per position """ + scorefile, target = _cast_cat(small_scorefile, small_target) + + # get_all_matches calls label_matches + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + + logger.debug(labelled.select(['ID', 'match_type', 'best_match', 'ambiguous', 'match_status', 'exclude'])) + + assert labelled['best_match'].to_list() == [True, True, True] + assert labelled['ambiguous'].to_list() == [False, True, False] + assert labelled['exclude'].to_list() == [False, True, False] + assert labelled['match_status'].to_list() == ["matched", "excluded", "matched"] + + +def test_ambiguous_label(small_flipped_scorefile, small_target): + """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ + scorefile, target = _cast_cat(small_flipped_scorefile, small_target) + + no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + + assert no_ambiguous['best_match'].to_list() == [True] + assert no_ambiguous['ambiguous'].to_list() == [True] + assert no_ambiguous['exclude'].to_list() == [True] + assert no_ambiguous['match_status'].to_list() == ["excluded"] + + # otherwise, ambiguous variants are kept + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + + assert labelled['best_match'].to_list() == [True] + assert labelled['ambiguous'].to_list() == [True] + assert labelled['exclude'].to_list() == [False] + assert labelled['match_status'].to_list() == ["matched"] + + +def test_duplicate_best_match(duplicated_matches, request): + # these matches come from different lines in the original scoring file + assert duplicated_matches["row_nr"].to_list() == [1, 4] + # but they have the same ID! 
+ assert duplicated_matches["ID"].to_list() == ["1:1:A:C", "1:1:A:C"] + # and they're matched with the same match type + assert duplicated_matches["match_type"].to_list() == ["refalt", "refalt"] + # oh dear, they're both the best match + assert duplicated_matches["best_match"].to_list() == [True, True] + # however, we've flagged them as duplicate IDs + assert duplicated_matches['duplicate_ID'].to_list() == [True, True] + + if request.node.callspec.id == "keep_first_match": + # and correctly label _the first occurring match_ as best match + assert duplicated_matches['exclude'].to_list() == [False, True] + assert duplicated_matches['match_status'].to_list() == ["matched", "excluded"] + elif request.node.callspec.id == "delete_both": + # and correctly labelled all duplicate instances for exclusion (default behaviour) + assert duplicated_matches['exclude'].to_list() == [True, True] + assert duplicated_matches['match_status'].to_list() == ["excluded", "excluded"] + + +def test_duplicate_best_match(duplicate_best_match): + # all best matches come from the same row number in the original scoring file + assert duplicate_best_match['row_nr'].to_list() == [1, 1, 1] + # and the match type is duplicated, so we can't prioritise + assert duplicate_best_match['match_type'].to_list() == ['no_oa_alt', 'no_oa_alt', 'no_oa_ref_flip'] + # find the duplicate best matches (with the same match type) + assert duplicate_best_match['duplicate_best_match'].to_list() == [True, True, False] + # and only keep the first occurring best match. the worse match type is correctly set to not_best too. + assert duplicate_best_match['match_status'].to_list() == ["matched", "not_best", "not_best"] + assert duplicate_best_match['best_match'].to_list() == [True, False, False] + + +@pytest.fixture(params=[True, False], ids=["keep_first_match", "delete_both"]) +def duplicated_matches(small_scorefile, small_target, request): + # pgs catalog scorefiles can contain the same variant remapped to multiple rows + # this happens after liftover to a different genome build + # row_nrs will be different, but other information may be the same + dups = (pl.concat([small_scorefile, small_scorefile]) + .with_column(pl.Series(list(range(1, 7))) + .alias('row_nr')) + .filter(pl.col('chr_name') == 1)) + + scorefile, target = _cast_cat(dups, small_target) + + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param) + + +@pytest.fixture +def multiple_match_types(small_target, small_scorefile): + # skip flip will return two candidate matches for one target position: refalt + refalt_flip + scorefile, target = _cast_cat(small_scorefile, small_target) + return (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('chr_name') == 2)) + + +@pytest.fixture +def duplicate_best_match(small_target, small_scorefile_no_oa): + # this type of target genome can sometimes occur when the REF is different at the same position + odd_target = {'#CHROM': [1, 1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'], 'ID': ['1:1:T:C', '1:1:A:A'], + 'is_multiallelic': [False, False]} + scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target)) + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 42d0e87..2c1c8f4 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -1,3 +1,5 @@ +""" Test that 
match strategies return the expected match results""" + from unittest.mock import patch import polars as pl @@ -5,7 +7,6 @@ from pgscatalog_utils.match.match import get_all_matches, _cast_categorical from pgscatalog_utils.match.match_variants import match_variants -from pgscatalog_utils.match.preprocess import complement_valid_alleles def test_match_fail(combined_scorefile, target_path, tmp_path): @@ -46,12 +47,14 @@ def test_match_strategies(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # check unambiguous matches - df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) + df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == False)) assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles - flip = (get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == True)) + flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'}) @@ -60,13 +63,14 @@ def test_match_strategies(small_scorefile, small_target): def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) - df = get_all_matches(scorefile, target, skip_flip=True).filter(pl.col('ambiguous') == False) + df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == False)) assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # check ambiguous matches - flip = (get_all_matches(scorefile, target, skip_flip=False) + flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) .filter(pl.col('ambiguous') == True)) assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'}) @@ -75,48 +79,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, skip_flip=True) + df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) assert set(df['ambiguous']) == {True} assert set(df['match_type']) == {'refalt'} - flip = get_all_matches(scorefile, target, skip_flip=False).filter(pl.col('ambiguous') == False) + flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + .filter(pl.col('ambiguous') == False)) assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) -@pytest.fixture -def small_scorefile(): - df = pl.DataFrame({"accession": ["test", "test", "test"], - "row_nr": [1, 2, 3], - "chr_name": [1, 2, 3], - "chr_position": [1, 2, 3], - "effect_allele": ["A", "A", "G"], - "other_allele": ["C", "T", "T"], - "effect_weight": [1, 2, 3], - "effect_type": ["additive", "additive", "additive"]}) - - return complement_valid_alleles(df, ["effect_allele", 
"other_allele"]) - - -@pytest.fixture -def small_scorefile_no_oa(small_scorefile): - return small_scorefile.with_column(pl.lit(None).alias('other_allele')) - - -@pytest.fixture -def small_flipped_scorefile(small_scorefile): - # simulate a scorefile on the wrong strand - return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) - .drop(['effect_allele', 'other_allele']) - .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) - .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) - -@pytest.fixture -def small_target(): - return pl.DataFrame({"#CHROM": [1, 2, 3], - "POS": [1, 2, 3], - "REF": ["A", "T", "T"], - "ALT": ["C", "A", "G"], - "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], - "is_multiallelic": [False, False, False]}) From 1a1529bdd771218b6438bd3ee7c5b106cb812273 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 12 Sep 2022 17:17:07 +0100 Subject: [PATCH 57/59] use session scopes for fixtures in conftest.py --- conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conftest.py b/conftest.py index 5027d61..46631c7 100644 --- a/conftest.py +++ b/conftest.py @@ -143,7 +143,7 @@ def hg19_coords(hg38_coords): return pd.DataFrame(d) -@pytest.fixture +@pytest.fixture(scope='session') def small_flipped_scorefile(small_scorefile): # simulate a scorefile on the wrong strand return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) @@ -152,7 +152,7 @@ def small_flipped_scorefile(small_scorefile): .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) -@pytest.fixture +@pytest.fixture(scope='session') def small_target(): return pl.DataFrame({"#CHROM": [1, 2, 3], "POS": [1, 2, 3], @@ -162,7 +162,7 @@ def small_target(): "is_multiallelic": [False, False, False]}) -@pytest.fixture +@pytest.fixture(scope='session') def small_scorefile(): df = pl.DataFrame({"accession": ["test", "test", "test"], "row_nr": [1, 2, 3], @@ -176,7 +176,7 @@ def small_scorefile(): return complement_valid_alleles(df, ["effect_allele", "other_allele"]) -@pytest.fixture +@pytest.fixture(scope='session') def small_scorefile_no_oa(small_scorefile): return small_scorefile.with_column(pl.lit(None).alias('other_allele')) From f1886d04bf528af28b4b822f33b8858366fe3123 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 13 Sep 2022 11:28:04 +0100 Subject: [PATCH 58/59] fix removing multiallelic variants with new polars version --- pgscatalog_utils/match/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index c6dbe47..1723f6d 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -42,7 +42,7 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " "variant representations only") logger.debug('Dropping multiallelic variants') - return df[~df['is_multiallelic']] + return df.filter(~df['is_multiallelic']) else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants From ce8b4beff57bb83aa57fc5d63a0977a46c422951 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 13 Sep 2022 15:07:53 +0100 Subject: [PATCH 59/59] Fix incorrect deduplication (wasn't using matched_effect_allele): this caused splitting into 3 files when a flipped 
variant was also matched --- pgscatalog_utils/match/write.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index d7a0378..53eb15f 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -76,7 +76,7 @@ def _split_effect_type(df: pl.DataFrame) -> dict[str, pl.DataFrame]: def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFrame]: """ Find variant matches that have duplicate identifiers When merging a lot of scoring files, sometimes a variant might be duplicated - this can happen when the effect allele differs at the same position, e.g.: + this can happen when the matched effect allele differs at the same position, e.g.: - chr1: chr2:20003:A:C A 0.3 NA - chr1: chr2:20003:A:C C NA 0.7 where the last two columns represent different scores. plink demands @@ -85,20 +85,20 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra df: A dataframe containing all matches, with columns ID, effect_allele, and effect_weight Returns: - A list of dataframes, with unique ID - effect allele combinations + A list of dataframes, with unique ID - matched effect allele combinations """ # 1. unique ID - EA is important because normal duplicates are already # handled by pivoting, and it's pointless to split them unnecessarily # 2. use cumcount to number duplicate IDs # 3. join cumcount data on original DF, use this data for splitting - ea_count: pl.DataFrame = (df.select(["ID", "effect_allele"]) + ea_count: pl.DataFrame = (df.select(["ID", "matched_effect_allele"]) .unique() .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") ])) - dup_label: pl.DataFrame = df.join(ea_count, on=["ID", "effect_allele"], how="left") + dup_label: pl.DataFrame = df.join(ea_count, on=["ID", "matched_effect_allele"], how="left") # now split the matched variants, and make sure we don't lose any n_splits: int = ea_count.select("cumcount").max()[0, 0] + 1 # cumcount = ngroup-1
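
A minimal, self-contained sketch of the splitting logic the final patch corrects, included here for illustration only (it is not part of the patch series itself). It assumes the same polars API the patches use (with_columns, cumcount, over); the toy data, the ID strings, and the `splits` list are made up, but the cumcount-over-ID idea mirrors _deduplicate_variants: after reducing to unique (ID, matched_effect_allele) pairs, repeated IDs are numbered 0, 1, ... and each number becomes its own output, so every written scorefile keeps unique variant IDs (the constraint the docstring above describes for plink).

    import polars as pl

    # toy matches: one ID matched with two different effect alleles (values are made up)
    matches = pl.DataFrame({
        "ID": ["2:20003:A:C", "2:20003:A:C", "3:50924580:C:A"],
        "matched_effect_allele": ["A", "C", "C"],
        "effect_weight": [0.3, 0.7, 0.1],
    })

    # number duplicate IDs across unique (ID, matched_effect_allele) pairs
    ea_count = (matches.select(["ID", "matched_effect_allele"])
                .unique()
                .with_columns([
                    pl.col("ID").cumcount().over(["ID"]).alias("cumcount"),
                    pl.col("ID").count().over(["ID"]).alias("count"),
                ]))

    dup_label = matches.join(ea_count, on=["ID", "matched_effect_allele"], how="left")
    n_splits = dup_label.select("cumcount").max()[0, 0] + 1  # cumcount = ngroup - 1

    # each split now contains unique variant IDs
    splits = [dup_label.filter(pl.col("cumcount") == i).drop(["cumcount", "count"])
              for i in range(n_splits)]

With the toy data above this yields two splits: the duplicated ID lands in separate outputs, while the singleton stays in the first.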