From 77098befc3a1d692292fbbffb0ff977357c116b4 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 2 Sep 2022 13:16:03 +0100 Subject: [PATCH 01/46] Fist commit of the 'validate' utils --- pgscatalog_utils/validate/__init__.py | 0 pgscatalog_utils/validate/common_constants.py | 44 +++ .../validate/formatted/__init__.py | 0 .../validate/formatted/validator.py | 230 +++++++++++ .../validate/harmonized_position/__init__.py | 0 .../validate/harmonized_position/validator.py | 137 +++++++ pgscatalog_utils/validate/helpers.py | 29 ++ pgscatalog_utils/validate/schemas.py | 158 ++++++++ .../validate/validate_scorefile.py | 203 ++++++++++ pgscatalog_utils/validate/validator_base.py | 364 ++++++++++++++++++ 10 files changed, 1165 insertions(+) create mode 100644 pgscatalog_utils/validate/__init__.py create mode 100644 pgscatalog_utils/validate/common_constants.py create mode 100644 pgscatalog_utils/validate/formatted/__init__.py create mode 100644 pgscatalog_utils/validate/formatted/validator.py create mode 100644 pgscatalog_utils/validate/harmonized_position/__init__.py create mode 100644 pgscatalog_utils/validate/harmonized_position/validator.py create mode 100644 pgscatalog_utils/validate/helpers.py create mode 100644 pgscatalog_utils/validate/schemas.py create mode 100644 pgscatalog_utils/validate/validate_scorefile.py create mode 100644 pgscatalog_utils/validate/validator_base.py diff --git a/pgscatalog_utils/validate/__init__.py b/pgscatalog_utils/validate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/validate/common_constants.py b/pgscatalog_utils/validate/common_constants.py new file mode 100644 index 0000000..768752a --- /dev/null +++ b/pgscatalog_utils/validate/common_constants.py @@ -0,0 +1,44 @@ +SNP_DSET = 'rsID' +CHR_DSET = 'chr_name' +BP_DSET = 'chr_position' +EFFECT_DSET = 'effect_allele' +OTH_DSET = 'other_allele' +EFFECT_WEIGHT_DSET = 'effect_weight' + +# Other columns +LOCUS_DSET = 'locus_name' +OR_DSET = 'OR' +HR_DSET = 'HR' +BETA_DSET = 'beta' +FREQ_DSET = 'allelefrequency_effect' +FLAG_INTERACTION_DSET = 'is_interaction' +FLAG_RECESSIVE_DSET = 'is_recessive' +FLAG_HAPLOTYPE_DSET = 'is_haplotype' +FLAG_DIPLOTYPE_DSET = 'is_diplotype' +METHOD_DSET = 'imputation_method' +SNP_DESC_DSET = 'variant_description' +INCLUSION_DSET = 'inclusion_criteria' +DOSAGE_0_WEIGHT = 'dosage_0_weight' +DOSAGE_1_WEIGHT = 'dosage_1_weight' +DOSAGE_2_WEIGHT = 'dosage_2_weight' +# hmPOS +HM_SOURCE_DSET = 'hm_source' +HM_SNP_DSET = 'hm_rsID' +HM_CHR_DSET = 'hm_chr' +HM_BP_DSET = 'hm_pos' +HM_OTH_DSET = 'hm_inferOtherAllele' +HM_MATCH_CHR_DSET = 'hm_match_chr' +HM_MATCH_BP_DSET = 'hm_match_pos' +# hmFinal +VARIANT_DSET = 'variant_id' +HM_CODE_DSET = 'hm_code' +HM_INFO_DSET = 'hm_info' + + +DSET_TYPES = {SNP_DSET: str, CHR_DSET: str, BP_DSET: int, EFFECT_DSET: str, OTH_DSET: str, + EFFECT_WEIGHT_DSET: float, VARIANT_DSET: str, HM_CODE_DSET: int, HM_INFO_DSET: str, LOCUS_DSET: str, OR_DSET: float, HR_DSET: float, BETA_DSET: float, FREQ_DSET: float, + FLAG_INTERACTION_DSET: str, FLAG_RECESSIVE_DSET: str, FLAG_HAPLOTYPE_DSET: str, FLAG_DIPLOTYPE_DSET: str, + METHOD_DSET: str, SNP_DESC_DSET: str, INCLUSION_DSET: str, DOSAGE_0_WEIGHT: float, DOSAGE_1_WEIGHT: float, DOSAGE_2_WEIGHT: float, + HM_SOURCE_DSET:str, HM_SNP_DSET: str, HM_CHR_DSET: str, HM_BP_DSET: int, HM_OTH_DSET: str, HM_MATCH_CHR_DSET: str, HM_MATCH_BP_DSET: int} + +TO_DISPLAY_ORDER = [ SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, OR_DSET, HR_DSET, HM_CODE_DSET, 
HM_INFO_DSET, HM_SOURCE_DSET, HM_SNP_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET] \ No newline at end of file diff --git a/pgscatalog_utils/validate/formatted/__init__.py b/pgscatalog_utils/validate/formatted/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py new file mode 100644 index 0000000..0a5ed3b --- /dev/null +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -0,0 +1,230 @@ +import gzip +import re +from pandas_schema import Schema +from pgscatalog_utils.validate.schemas import * +from pgscatalog_utils.validate.validator_base import * +# from schemas import * +# from validator_base import * + +''' +PGS Catalog Harmonized file validator +- using pandas_schema https://github.com/TMiguelT/PandasSchema +''' + +class ValidatorFormatted(ValidatorBase): + + def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): + super().__init__(file, score_dir, logfile, error_limit) + self.score_dir=None + self.meta_format = FORMATTED_META_GENERIC + self.validators = FORMATTED_VALIDATORS + self.valid_cols = VALID_COLS_FORMATTED + self.valid_type = VALID_TYPE_FORMATTED + + + def extract_specific_metadata(self,line): + ''' Extract some of the metadata. ''' + match_variants_number = re.search(r'#variants_number=(\d+)', line) + if match_variants_number: + self.variants_number = int(match_variants_number.group(1)) + + + def get_and_check_variants_number(self): + ''' Verify that the number of variant lines corresponds to the number of variants in the headers ''' + variant_lines = 0 + + with gzip.open( self.file, 'rb') as f: + line_number = 0 + for line in f: + line_number += 1 + line = line.decode('utf-8').rstrip() + if line.startswith('#'): + match_variants_number = re.search(r'#variants_number=(\d+)', line) + if match_variants_number: + self.variants_number = int(match_variants_number.group(1)) + else: + variant_lines += 1 + if re.search('\w+', line): # Line not empty + cols = line.split(self.sep) + has_trailing_spaces = self.check_leading_trailing_spaces(cols,line_number) + if has_trailing_spaces: + self.global_errors += 1 + else: + self.logger.error(f'- Line {line_number} is empty') + self.global_errors += 1 + + if self.variants_number: + variant_lines -= 1 # Remove the header line from the count + if self.variants_number != variant_lines: + self.logger.error(f'- The number of variants lines in the file ({variant_lines}) and the number of variants declared in the headers ({self.variants_number}) are different') + self.global_errors += 1 + else: + self.logger.error("- Can't retrieve the number of variants from the headers") + self.global_errors += 1 + + + def detect_duplicated_rows(self,dataframe_chunk): + ''' Detect duplicated rows in the scoring file. 
'''
+        # Columns of interest to compare the different rows
+        cols_sel = []
+        for col in ['rsID','chr_name','chr_position','effect_allele','other_allele']:
+            if col in self.cols_to_validate:
+                cols_sel.append(col)
+
+        duplicate_status = dataframe_chunk.duplicated(cols_sel)
+        if any(duplicate_status):
+            duplicated_rows = dataframe_chunk[duplicate_status]
+            self.logger.error(f'Duplicated row(s) found: {len(duplicated_rows.index)}\n\t-> {duplicated_rows.to_string(header=False,index=False)}')
+            self.global_errors += 1
+            for index in duplicated_rows.index:
+                self.bad_rows.append(index)
+
+
+    def validate_data(self):
+        if not self.open_file_and_check_for_squareness():
+            self.logger.error("Please fix the table. Some rows have different numbers of columns to the header")
+            self.logger.info("Rows with different numbers of columns to the header are not validated")
+        # Check the consistency between the declared number of variants and the actual number of variants in the file
+        self.get_and_check_variants_number()
+
+        for chunk in self.df_iterator(self.file):
+            to_validate = chunk[self.cols_to_read]
+            to_validate.columns = self.cols_to_validate # sets the headers to standard format if needed
+
+            # Detect duplicated rows
+            self.detect_duplicated_rows(to_validate)
+            # Validate the SNP column if present
+            if SNP_DSET in self.header:
+                if CHR_DSET in self.header and BP_DSET in self.header:
+                    self.schema = Schema([FORMATTED_VALIDATORS_SNP_EMPTY[h] for h in self.cols_to_validate])
+                else:
+                    self.schema = Schema([FORMATTED_VALIDATORS_SNP[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+
+            if CHR_DSET in self.header and BP_DSET in self.header:
+                self.schema = Schema([FORMATTED_VALIDATORS_POS[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+            if OR_DSET in self.header:
+                self.schema = Schema([FORMATTED_VALIDATORS_OR[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+            if HR_DSET in self.header:
+                self.schema = Schema([FORMATTED_VALIDATORS_HR[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+            self.process_errors()
+            if len(self.bad_rows) >= self.error_limit:
+                break
+        if not self.bad_rows and not self.global_errors:
+            self.logger.info("File is valid")
+            return True
+
+        else:
+            self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit))
+            return False
+
+
+    def validate_filename(self):
+        filename = self.file.split('/')[-1].split('.')[0]
+        if re.match('^PGS\d{6}$', filename):
+            return True
+        else:
+            self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where the 'X' are the 6 digits of the PGS identifier (e.g.
PGS000001)".format(filename)) + return False + + + def validate_headers(self): + self.setup_field_validation() + self.detect_genomebuild_with_rsid() + required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header) + if not required_is_subset: + # check if everything but snp: + required_is_subset = set(CHR_COLS_VAR_FORMATTED).issubset(self.header) + if not required_is_subset: + required_is_subset = set(SNP_COLS_VAR_FORMATTED).issubset(self.header) + if not required_is_subset: + self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_FORMATTED, self.header)) + + # Check if at least one of the effect columns is there + has_effect_col = 0 + for col in STD_COLS_EFFECT_FORMATTED: + if set([col]).issubset(self.header): + has_effect_col = 1 + break + if not has_effect_col: + self.logger.error("Required headers: at least one of the columns '{}' must be in the file header: {}".format(STD_COLS_EFFECT_FORMATTED, self.header)) + required_is_subset = None + + return required_is_subset + + + def detect_genomebuild_with_rsid(self): + ''' The column "rsID" should always be in the scoring file when the genome build is not reported (i.e. "NR") ''' + self.get_genomebuild() + if self.genomebuild == 'NR': + if SNP_DSET not in self.header: + self.logger.error(f"- The combination: Genome Build = '{self.genomebuild}' & the missing column '{SNP_DSET}' in the header is not allowed as we have to manually guess the genome build.") + self.global_errors += 1 + + + def get_genomebuild(self): + ''' Retrieve the Genome Build from the comments ''' + with gzip.open(self.file, 'rb') as f_in: + for f_line in f_in: + line = f_line.decode() + # Update header + if line.startswith('#genome_build'): + gb = (line.split('='))[1] + self.genomebuild = gb.strip() + return + + +################################################################## + +def init_validator(file, logfile, score_dir=None) -> ValidatorFormatted: + validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) + return validator + +# def run_validator(file, check_filename, logfile, score_dir=None): + +# validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) + +# validator.logger.propagate = False + +# if not file or not logfile: +# validator.logger.info("Missing file and/or logfile") +# validator.logger.info("Exiting before any further checks") +# sys.exit() +# if not os.path.exists(file): +# validator.logger.info("Error: the file '"+file+"' can't be found") +# validator.logger.info("Exiting before any further checks") +# sys.exit() + +# is_ok_to_run_validation = 1 +# validator.logger.info("Validating file extension...") +# if not validator.validate_file_extension(): +# validator.logger.info("Invalid file extension: {}".format(file)) +# validator.logger.info("Exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation and check_filename: +# validator.logger.info("Validating file name...") +# if not validator.validate_filename(): +# validator.logger.info("Invalid filename: {}".format(file)) +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating headers...") +# if not validator.validate_headers(): +# validator.logger.info("Invalid headers...exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating data...") +# validator.validate_data() + +# # Close log handler +# validator.logger.removeHandler(validator.handler) +# 
validator.handler.close() \ No newline at end of file diff --git a/pgscatalog_utils/validate/harmonized_position/__init__.py b/pgscatalog_utils/validate/harmonized_position/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py new file mode 100644 index 0000000..c12ca58 --- /dev/null +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -0,0 +1,137 @@ +import re +from pgscatalog_utils.validate.schemas import * +from pgscatalog_utils.validate.validator_base import * + +''' +PGS Catalog Harmonized file validator +- using pandas_schema https://github.com/TMiguelT/PandasSchema +''' + +class ValidatorPos(ValidatorBase): + ''' Validator for the HmPOS Harmonized file format. ''' + + def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): + super().__init__(file, score_dir, logfile, error_limit) + self.meta_format = HM_META_POS + self.validators = POS_VALIDATORS + self.valid_cols = VALID_COLS_POS + self.valid_type = VALID_TYPE_POS + + + def extract_specific_metadata(self,line): + ''' Extract some of the metadata. ''' + match_variants_number = re.search(r'#variants_number=(\d+)', line) + if match_variants_number: + self.variants_number = int(match_variants_number.group(1)) + + + def validate_line_content(self,cols_content,var_line_number): + ''' Populate the abstract method from ValidatorBase, to check some data in esch row. ''' + # Check lines + line_dict = dict(zip(self.header, cols_content)) + line_cols = line_dict.keys() + # Check each chromosome data is consistent + chr_cols = ['chr_name', 'hm_chr', 'hm_match_chr'] + if all(col_name in line_cols for col_name in chr_cols): + if line_dict['chr_name'] == line_dict['hm_chr'] and line_dict['hm_match_chr'] != 'True': + self.logger.error(f"- Variant line {var_line_number} | 'hm_match_chr' should be 'True': same chromosome ('chr_name={line_dict['chr_name']}' vs 'hm_chr={line_dict['hm_chr']}')") + # Check each position data is consistent + pos_cols = ['chr_position', 'hm_pos', 'hm_match_pos'] + if all(col_name in line_cols for col_name in pos_cols): + if line_dict['chr_position'] == line_dict['hm_pos'] and line_dict['hm_match_pos'] != 'True': + self.logger.error(f"- Variant line {var_line_number} | 'hm_match_pos' should be 'True': same position ('chr_position={line_dict['chr_position']}' vs 'hm_pos={line_dict['hm_pos']}')") + + + def validate_filename(self): + ''' Validate the file name structure. ''' + pgs_id, build = None, None + # hmPOS + filename = self.file.split('/')[-1].split('.')[0] + filename_parts = filename.split('_hmPOS_') + if len(filename_parts) != 2: + self.logger.error("Filename: {} should follow the pattern _hmPOS_.txt.gz [build=GRChXX]".format(filename)) + return False + else: + pgs_id, build = filename_parts + self.file_pgs_id = pgs_id + self.file_genomebuild = build + if not self.check_build_is_legit(build): + self.logger.error("Build: {} is not an accepted build value".format(build)) + return False + self.logger.info("Filename looks good!") + return True + + + def validate_headers(self): + ''' Validate the list of column names. 
''' + # Check if it has at least a "SNP" column or a "chromosome" column + self.setup_field_validation() + required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header) + if not required_is_subset: + self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_POS, self.header)) + + # Check if it has at least a "SNP" column or a "chromosome" column + required_pos = set(SNP_COLS_VAR_POS).issubset(self.header) + if not required_pos: + # check if everything but snp: + required_pos = set(CHR_COLS_VAR_POS).issubset(self.header) + if not required_pos: + self.logger.error("One of the following required header is missing: '{}' and/or '{}' are not in the file header: {}".format(SNP_COLS_VAR_POS, CHR_COLS_VAR_POS, self.header)) + required_is_subset = required_pos + + return required_is_subset + + +################################################################## + +def init_validator(file, logfile, score_dir=None) -> ValidatorPos: + validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) + return validator + +# def run_validator(file, check_filename, logfile, score_dir=None): + +# validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) + +# validator.logger.propagate = False + +# if not file or not logfile: +# validator.logger.info("Missing file and/or logfile") +# validator.logger.info("Exiting before any further checks") +# sys.exit() +# if not os.path.exists(file): +# validator.logger.info("Error: the file '"+file+"' can't be found") +# validator.logger.info("Exiting before any further checks") +# sys.exit() + +# is_ok_to_run_validation = 1 +# validator.logger.info("Validating file extension...") +# if not validator.validate_file_extension(): +# validator.logger.info("Invalid file extension: {}".format(file)) +# validator.logger.info("Exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation and check_filename: +# validator.logger.info("Validating file name...") +# if not validator.validate_filename(): +# validator.logger.info("Invalid filename: {}".format(file)) +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Comparing filename with metadata...") +# if not validator.compare_with_filename(): +# validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating headers...") +# if not validator.validate_headers(): +# validator.logger.info("Invalid headers...exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating data...") +# validator.validate_data() + +# # Close log handler +# validator.logger.removeHandler(validator.handler) +# validator.handler.close() \ No newline at end of file diff --git a/pgscatalog_utils/validate/helpers.py b/pgscatalog_utils/validate/helpers.py new file mode 100644 index 0000000..7d786e5 --- /dev/null +++ b/pgscatalog_utils/validate/helpers.py @@ -0,0 +1,29 @@ +import math +import pandas as pd +from pandas_schema.validation import _SeriesValidation + + +class InInclusiveRangeValidation(_SeriesValidation): + """ + Checks that each element in the series is within a given inclusive numerical range. + Doesn't care if the values are not numeric - it will try anyway. 
+ """ + def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): + """ + :param min: The minimum (inclusive) value to accept + :param max: The maximum (inclusive) value to accept + """ + self.min = min + self.max = max + super().__init__(**kwargs) + + @property + def default_message(self): + return 'was not in the range [{}, {})'.format(self.min, self.max) + + def validate(self, series: pd.Series) -> pd.Series: + series = pd.to_numeric(series, errors='coerce') + return (series >= self.min) & (series <= self.max) + + + diff --git a/pgscatalog_utils/validate/schemas.py b/pgscatalog_utils/validate/schemas.py new file mode 100644 index 0000000..7487b21 --- /dev/null +++ b/pgscatalog_utils/validate/schemas.py @@ -0,0 +1,158 @@ +import sys +import numpy as np +from pandas_schema import Column +from pandas_schema.validation import MatchesPatternValidation, InListValidation, CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation, CustomElementValidation +from pgscatalog_utils.validate.helpers import InInclusiveRangeValidation +from pgscatalog_utils.validate.common_constants import * + + +#### Validation types #### + +VALID_TYPE_FORMATTED = 'formatted' +VALID_TYPE_POS = 'hm_pos' + + +#### Columns #### + +# Formatted scoring files +STD_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET, SNP_DSET) #OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, SE_DSET, FREQ_DSET , EFFECT_DSET, OTH_DSET) + +SNP_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET) +CHR_COLS_VAR_FORMATTED = (EFFECT_DSET, SNP_DSET) + +STD_COLS_EFFECT_FORMATTED = (EFFECT_WEIGHT_DSET,OR_DSET,HR_DSET) + +VALID_COLS_FORMATTED = (EFFECT_WEIGHT_DSET, OR_DSET, HR_DSET, BETA_DSET, FREQ_DSET, LOCUS_DSET, EFFECT_DSET, OTH_DSET, CHR_DSET, BP_DSET, SNP_DSET) + +# Harmonized scoring files - POS +STD_COLS_VAR_POS = (HM_SOURCE_DSET, HM_CHR_DSET, HM_BP_DSET) + +SNP_COLS_VAR_POS = (SNP_DSET, HM_SNP_DSET) +CHR_COLS_VAR_POS = (CHR_DSET,) + +VALID_COLS_POS = (HM_SOURCE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET) + +# Harmonized scoring files - Final +STD_COLS_VAR_FINAL = (EFFECT_DSET, EFFECT_WEIGHT_DSET, HM_CODE_DSET, HM_INFO_DSET) + +SNP_COLS_VAR_FINAL = (VARIANT_DSET,) +CHR_COLS_VAR_FINAL = (CHR_DSET, HM_CHR_DSET) + +VALID_COLS_FINAL = (SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, HM_CODE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET) + + +#### Global variables #### + +VALID_CHROMOSOMES = ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '11', '12', '13', '14', '15', '16', + '17', '18', '19', '20', '21', '22', + 'X', 'x', 'Y', 'y', 'XY', 'xy', 'MT', 'Mt', 'mt'] + +VALID_FILE_EXTENSIONS = [".txt", ".txt.gz"] + +# For the harmonized files +VALID_SOURCES = ['ENSEMBL','Author-reported'] +# VALID_CODES = ['5','4','3','1','0','-1','-4','-5'] +BUILD_LIST = ['GRCh37','GRCh38'] + + +error_msg = 'this column cannot be null/empty' +null_validation = CustomElementValidation(lambda d: d is not np.nan and d != '', error_msg) + + +#### Validators #### + +# Generic/shared validators +GENERIC_VALIDATORS = { + CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True), + BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True), + EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET]), null_validation], allow_empty=False), + EFFECT_DSET: 
Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=False), + OTH_DSET: Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True), + LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], allow_empty=True) +} + +# Formatted validators +FORMATTED_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +FORMATTED_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=True) +FORMATTED_VALIDATORS[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[BETA_DSET] = Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[FREQ_DSET] = Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_0_WEIGHT] = Column(DOSAGE_0_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_0_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_1_WEIGHT] = Column(DOSAGE_1_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_1_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_2_WEIGHT] = Column(DOSAGE_2_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_2_WEIGHT]), null_validation], allow_empty=True) + +FORMATTED_VALIDATORS_SNP = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=False) + +FORMATTED_VALIDATORS_SNP_EMPTY = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP_EMPTY[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs[0-9]+|HLA\-\w+\*[0-9]+|nan)$')], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_POS = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_POS[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_POS[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_OR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_OR[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=False) + +FORMATTED_VALIDATORS_HR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_HR[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=False) + +# Position validators +POS_VALIDATORS = {} +POS_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +POS_VALIDATORS[HM_SOURCE_DSET] = Column(HM_SOURCE_DSET, [CanConvertValidation(DSET_TYPES[HM_SOURCE_DSET]), InListValidation(VALID_SOURCES), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], 
allow_empty=False) +POS_VALIDATORS[HM_SNP_DSET] = Column(HM_SNP_DSET, [CanConvertValidation(DSET_TYPES[HM_SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=True) +POS_VALIDATORS[HM_CHR_DSET] = Column(HM_CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True) +POS_VALIDATORS[HM_BP_DSET] = Column(HM_BP_DSET, [CanConvertValidation(DSET_TYPES[HM_BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True) +POS_VALIDATORS[HM_OTH_DSET] = Column(HM_OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\/]+$')], allow_empty=True) +POS_VALIDATORS[HM_MATCH_CHR_DSET] = Column(HM_MATCH_CHR_DSET, [InListValidation(['True', 'False'])], allow_empty=True) +POS_VALIDATORS[HM_MATCH_BP_DSET] = Column(HM_MATCH_BP_DSET, [InListValidation(['True', 'False'])], allow_empty=True) + +# Final validator +# FINAL_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +# FINAL_VALIDATORS[EFFECT_DSET] = Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True) +# FINAL_VALIDATORS[OTH_DSET] = Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\.]+$')], allow_empty=True) +# FINAL_VALIDATORS[VARIANT_DSET] = Column(VARIANT_DSET, [CanConvertValidation(DSET_TYPES[VARIANT_DSET]), MatchesPatternValidation(r'^((rs|HLA\-\w+\*)[0-9]+|\.)$')], allow_empty=True) +# FINAL_VALIDATORS[HM_CODE_DSET] = Column(HM_CODE_DSET, [InListValidation(VALID_CODES), null_validation], allow_empty=True) +# FINAL_VALIDATORS[HM_INFO_DSET] = Column(HM_INFO_DSET, [CanConvertValidation(DSET_TYPES[HM_INFO_DSET]), null_validation], allow_empty=True) + + +#### Metadata entries #### + +FORMATTED_META_GENERIC = [ + '###PGS CATALOG SCORING FILE', + '#format_version', + '##POLYGENIC SCORE', + '#pgs_id', + '#pgs_name', + '#trait_reported', + '#trait_mapped', + '#trait_efo', + '#genome_build', + '#variants_number', + '#weight_type', + '##SOURCE INFORMATION', + '#pgp_id', + '#citation' +] + +HM_META_GENERIC = [ x for x in FORMATTED_META_GENERIC ] +HM_META_GENERIC.append('##HARMONIZATION DETAILS') + +HM_META_POS = [ x for x in HM_META_GENERIC ] +HM_META_POS.append('#HmPOS_build') +HM_META_POS.append('#HmPOS_date') +HM_META_POS.append('#HmPOS_match_chr') +HM_META_POS.append('#HmPOS_match_pos') + +# HM_META_FINAL = [ x for x in HM_META_GENERIC ] +# HM_META_FINAL.append('#Hm_file_version') +# HM_META_FINAL.append('#Hm_genome_build') +# HM_META_FINAL.append('#Hm_reference_source') +# HM_META_FINAL.append('#Hm_creation_date') +# HM_META_FINAL.append('#Hm_variants_number_matched') +# HM_META_FINAL.append('#Hm_variants_number_unmapped') \ No newline at end of file diff --git a/pgscatalog_utils/validate/validate_scorefile.py b/pgscatalog_utils/validate/validate_scorefile.py new file mode 100644 index 0000000..f31ef88 --- /dev/null +++ b/pgscatalog_utils/validate/validate_scorefile.py @@ -0,0 +1,203 @@ +import os, glob, re +import argparse +import logging + +data_sum = {'valid': [], 'invalid': [], 'other': []} + +val_types = ('formatted', 'hm_pos') + +logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + +def _read_last_line(file: str) -> str: + ''' + Return the last line of the file + ''' + fileHandle = open ( file,"r" ) + lineList = fileHandle.readlines() + fileHandle.close() + return lineList[-1] + + +def _file_validation_state(filename: str, log_file: str) -> None: + global data_sum + if os.path.exists(log_file): + log_result = _read_last_line(log_file) + if re.search("File is valid", log_result): + print("> valid\n") + data_sum['valid'].append(filename) + elif 
re.search("File is invalid", log_result): + print("#### invalid! ####\n") + data_sum['invalid'].append(filename) + else:# + print("!! validation process had an issue. Please look at the logs.\n") + data_sum['other'].append(filename) + else: + print("!! validation process had an issue: the log file can't be found") + data_sum['other'].append(filename) + + +def _run_validator(validator: object, file: str, check_filename: bool, logfile: str, validator_type: str) -> None: + ''' Main method to run the PGS file validator ''' + validator.logger.propagate = False + + is_ok_to_continue_validation = 1 + + # Check files exist + if not file or not logfile: + validator.logger.info("Missing file and/or logfile") + is_ok_to_continue_validation = 0 + elif file and not os.path.exists(file): + validator.logger.info("Error: the file '"+file+"' can't be found") + is_ok_to_continue_validation = 0 + + # Validate file extension + validator.logger.info("Validating file extension...") + if not validator.validate_file_extension(): + validator.logger.info("Invalid file extension: {}".format(file)) + is_ok_to_continue_validation = 0 + # Validate file name nomenclature + if is_ok_to_continue_validation and check_filename: + validator.logger.info("Validating file name...") + if not validator.validate_filename(): + validator.logger.info("Invalid filename: {}".format(file)) + is_ok_to_continue_validation = 0 + + # Only for harmonized files + if is_ok_to_continue_validation and validator_type != 'formatted': + validator.logger.info("Comparing filename with metadata...") + if not validator.compare_with_filename(): + validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) + is_ok_to_continue_validation = 0 + + # Validate column headers + if is_ok_to_continue_validation: + validator.logger.info("Validating headers...") + if not validator.validate_headers(): + validator.logger.info("Invalid headers...exiting before any further checks") + is_ok_to_continue_validation = 0 + + # Validate data content + if is_ok_to_continue_validation: + validator.logger.info("Validating data...") + validator.validate_data() + + if is_ok_to_continue_validation == 0: + validator.logger.info("Exiting before any further checks") + + # Close log handler + validator.logger.removeHandler(validator.handler) + validator.handler.close() + + +def _check_args(args): + global score_dir + + ## Check parameters ## + # Type of validator + if args.t not in val_types: + print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.") + exit(1) + # Logs dir + if not os.path.isdir(args.log_dir): + print(f"Error: Log dir '{args.log_dir}' can't be found!") + exit(1) + # File and directory parameters (only one of the '-f' and '--dir' can be used) + if args.f and args.dir: + print("Error: you can't use both options [-f] - single scoring file and [--dir] - directory of scoring files. 
Please use only one of these two options!")
+        exit(1)
+    elif not args.f and not args.dir:
+        print("Error: you need to provide a scoring file [-f] or a directory of scoring files [--dir]!")
+        exit(1)
+    elif args.f and not os.path.isfile(args.f):
+        print(f"Error: Scoring file '{args.f}' can't be found!")
+        exit(1)
+    elif args.dir and not os.path.isdir(args.dir):
+        print(f"Error: the scoring file directory '{args.dir}' can't be found!")
+        exit(1)
+    # Scoring files directory (only to compare with the harmonized files)
+    score_dir = None
+    if args.score_dir:
+        score_dir = args.score_dir
+        if not os.path.isdir(score_dir):
+            print(f"Error: Scoring file directory '{score_dir}' can't be found!")
+            exit(1)
+    elif args.t != 'formatted':
+        print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.")
+
+
+def validate_file(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None:
+    ''' Run the file validator '''
+    file = os.path.basename(filepath)
+    filename = file.split('.')[0]
+    print(f"# Filename: {file}")
+    log_file = log_dir+'/'+filename+'_log.txt'
+
+    # Run validator
+    validator = validator_package.init_validator(filepath,log_file,score_dir)
+    _run_validator(validator,filepath,check_filename,log_file,validator_type)
+
+    # Check log
+    _file_validation_state(file,log_file)
+
+
+def main():
+    global data_sum, score_dir
+
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE')
+    argparser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME')
+    argparser.add_argument('--dir', help='The name of the directory containing the files that need to be processed (no need to use the [-f] option)')
+    argparser.add_argument('--score_dir', help='The name of the directory containing the formatted scoring files to compare with harmonized scoring files')
+    argparser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True)
+    argparser.add_argument('--check_filename', help='Check that the file name matches the PGS Catalog nomenclature', required=False, action='store_true')
+
+    args = argparser.parse_args()
+
+    ## Check parameters ##
+    _check_args(args)
+
+    # Check PGS Catalog file name nomenclature
+    check_filename = False
+    if args.check_filename:
+        check_filename = True
+    else:
+        print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.")
+
+    validator_type = args.t
+    files_dir = args.dir
+
+    log_dir = args.log_dir
+
+    ## Select validator class ##
+    if validator_type == 'formatted':
+        import pgscatalog_utils.validate.formatted.validator as validator_package
+    elif validator_type == 'hm_pos':
+        import pgscatalog_utils.validate.harmonized_position.validator as validator_package
+
+    ## Run validator ##
+    # One file
+    if args.f:
+        validate_file(args.f,log_dir,score_dir,validator_package,check_filename,validator_type)
+    # Content of the directory
+    elif files_dir:
+        count_files = 0
+        # Browse directory: for each file run validator
+        for filepath in sorted(glob.glob(files_dir+"/*.*")):
+
validate_file(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) + count_files += 1 + + # Print summary + results + print("\nSummary:") + if data_sum['valid']: + print(f"- Valid: {len(data_sum['valid'])}/{count_files}") + if data_sum['invalid']: + print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") + if data_sum['other']: + print(f"- Other issues: {len(data_sum['other'])}/{count_files}") + + if data_sum['invalid']: + print("Invalid files:") + print("\n".join(data_sum['invalid'])) + +if __name__ == '__main__': + main() diff --git a/pgscatalog_utils/validate/validator_base.py b/pgscatalog_utils/validate/validator_base.py new file mode 100644 index 0000000..f76adf1 --- /dev/null +++ b/pgscatalog_utils/validate/validator_base.py @@ -0,0 +1,364 @@ +import os, sys, gc +import gzip +import csv +import pathlib +import logging +import re +from typing import List +import pandas as pd +import pandas_schema +from pgscatalog_utils.validate.schemas import * +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='pandas_schema') + +''' +PGS Catalog file validator +- using pandas_schema https://github.com/TMiguelT/PandasSchema +''' + + +csv.field_size_limit(sys.maxsize) + +class ValidatorBase: + + valid_extensions = VALID_FILE_EXTENSIONS + validators = GENERIC_VALIDATORS + valid_cols = [] + valid_type = '' + sep = '\t' + + def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): + self.file = file + self.score_dir = score_dir + self.schema = None + self.header = [] + self.genomebuild = None + self.comment_lines_count = 1 # Counting the header line + self.cols_to_validate = [] + self.cols_to_read = [] + self.bad_rows = [] + self.row_errors = [] + self.errors_seen = {} + self.logfile = logfile + self.error_limit = int(error_limit) + + # Logging variables + self.logger = logging.getLogger(__name__) + self.handler = logging.FileHandler(self.logfile, 'w+') + self.handler.setLevel(logging.INFO) + self.logger.addHandler(self.handler) + self.logger.propagate = False + + self.global_errors = 0 + self.variants_number = 0 + + + def setup_field_validation(self): + ''' + Fetch the header and build the list of column to check/validate + ''' + self.header = self.get_header() + self.cols_to_validate = [h for h in self.header if h in self.valid_cols] + self.cols_to_read = [h for h in self.header if h in self.valid_cols] + + + def get_header(self): + ''' + Fetch the header (i.e. 
column names) information from the harmonized scoring file and store the list in a variable + ''' + first_row = pd.read_csv(self.file, sep=self.sep, comment='#', nrows=1, index_col=False) + # Check if the column headers have leading and/or trailing spaces + # The leading/trailing spaces should raise an error during the header validation + has_trailing_spaces = self.check_leading_trailing_spaces(first_row.columns.values) + if has_trailing_spaces: + self.global_errors += 1 + return first_row.columns.values + + + def get_genomebuild(self): + ''' Retrieve the Genome Build from the comments ''' + if self.valid_type == 'hm_pos': + self.genomebuild = self.get_comments_info('#HmPOS_build') + else: + self.genomebuild = self.get_comments_info('#Hm_genome_build') + + + def get_pgs_id(self): + ''' Retrieve the PGS ID from the comments ''' + self.pgs_id = self.get_comments_info('#pgs_id') + + + def validate_content(self): + ''' Validate the file content and verify that the number of variant lines corresponds to the number of variants in the headers ''' + variant_lines_count = 0 + meta_lines_count = 0 + + with gzip.open( self.file, 'rb') as f: + line_number = 0 + file_meta = [] + for line in f: + line_number += 1 + line = line.decode('utf-8').rstrip() + # Check Metadata + if line.startswith('#'): + self.extract_specific_metadata(line) + # Check that we have all the meta information + for meta in self.meta_format: + if line.startswith(meta): + file_meta.append(meta) + meta_lines_count += 1 + break + + # Check data + else: + variant_lines_count += 1 + if re.search('\w+', line): # Line not empty + cols_content = line.split(self.sep) + has_trailing_spaces = self.check_leading_trailing_spaces(cols_content,line_number) + if has_trailing_spaces: + self.global_errors += 1 + + if line.startswith('rsID') or line.startswith('chr_name'): + continue + + self.validate_line_content(cols_content,variant_lines_count) + else: + self.logger.error(f'- Line {line_number} is empty') + self.global_errors += 1 + + # Compare the number of metadata lines: read vs expected + if meta_lines_count != len(self.meta_format): + self.logger.error(f'- The number of metadata lines [i.e. starting with the "#" character] in the file ({meta_lines_count}) and the expected number of metadata lines ({len(self.meta_format)}) are different') + diff_list = list(set(self.meta_format).difference(file_meta)) + self.logger.error(f" > Missing metadata line(s): {', '.join(diff_list)}") + self.global_errors += 1 + + + def validate_data(self): + ''' Validate the file: data format and data content ''' + if not self.open_file_and_check_for_squareness(): + self.logger.error("Please fix the table. 
Some rows have different numbers of columns to the header") + self.logger.info("Rows with different numbers of columns to the header are not validated") + + # Validate data content and check the consitence between the declared variants number and the actual number of variants in the file + self.validate_content() + for chunk in self.df_iterator(self.file): + to_validate = chunk[self.cols_to_read] + to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded + + # Schema validation + self.schema = pandas_schema.Schema([self.validators[h] for h in self.cols_to_validate]) + errors = self.schema.validate(to_validate) + self.store_errors(errors) + + self.process_errors() + if len(self.bad_rows) >= self.error_limit: + break + + if not self.bad_rows and not self.global_errors: + self.logger.info("File is valid") + return True + else: + self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit)) + return False + + + def process_errors(self): + ''' Populate the logger error and the list of bad rows with the errors found. ''' + for error in self.row_errors: + if len(self.bad_rows) < self.error_limit or self.error_limit < 1: + self.logger.error(error) + if error.row not in self.bad_rows: + self.bad_rows.append(error.row) + self.row_errors = [] + + + def store_errors(self, errors: List[pandas_schema.validation_warning.ValidationWarning]): + ''' Capture the errors found into a temporary structure before being processed. ''' + for error in errors: + seen = 0 + row_number = error.row + file_line_number = row_number + self.comment_lines_count + 1 # rows are 0 indexes + error.row = str(row_number) + " (line "+str(file_line_number)+")" + col = error.column + # Avoid duplication as the errors can be detected several times + if row_number in self.errors_seen.keys(): + if col in self.errors_seen[row_number].keys(): + seen = 1 + else: + self.errors_seen[row_number][col] = 1 + else: + self.errors_seen[row_number] = { col : 1 } + if seen == 0: + self.row_errors.append(error) + + + def validate_file_extension(self): + ''' Check/validate the file name extension. ''' + check_exts = [self.check_ext(ext) for ext in self.valid_extensions] + if not any(check_exts): + self.valid_ext = False + self.logger.error("File extension should be in {}".format(self.valid_extensions)) + return False + else: + self.valid_ext = True + return True + + + def compare_number_of_rows(self): + ''' Compare the number of data rows between the harmonized and the formatted scoring files. 
'''
+        # Harmonization file - length
+        hm_rows_count = 0
+        for chunk in self.df_iterator(self.file):
+            hm_rows_count += len(chunk.index)
+            gc.collect()
+
+        # Formatted scoring file - length
+        scoring_rows_count = 0
+        scoring_file = f'{self.score_dir}/{self.pgs_id}.txt.gz'
+        if os.path.isfile(scoring_file):
+            for score_chunk in self.df_iterator(scoring_file):
+                scoring_rows_count += len(score_chunk.index)
+                gc.collect()
+
+        comparison_status = True
+        if scoring_rows_count == 0:
+            self.logger.error(f"Can't find the Scoring file '{scoring_file}' to compare the number of rows with the harmonization file!")
+            comparison_status = False
+        elif hm_rows_count != scoring_rows_count:
+            self.logger.error(f'The number of data rows differs between the Scoring file ({scoring_rows_count}) and the Harmonization POS file ({hm_rows_count})')
+            comparison_status = False
+        return comparison_status
+
+
+    def compare_with_filename(self):
+        ''' Check that the filename matches the information present in the file metadata (PGS ID, genome build). '''
+        comparison_status = True
+        if hasattr(self,'file_genomebuild') and hasattr(self,'file_pgs_id'):
+            # Extract some metadata
+            self.get_genomebuild()
+            self.get_pgs_id()
+            # Compare metadata with filename information
+            if self.file_genomebuild != self.genomebuild:
+                self.logger.error("Build: the genome build in the HmPOS_build header ({}) is different from the one in the filename ({})".format(self.genomebuild,self.file_genomebuild))
+                comparison_status = False
+            if self.file_pgs_id != self.pgs_id:
+                self.logger.error("ID: the PGS ID of the header ({}) is different from the one in the filename ({})".format(self.pgs_id,self.file_pgs_id))
+                comparison_status = False
+            # Compare number of rows with Scoring file
+            if self.score_dir:
+                row_comparison_status = self.compare_number_of_rows()
+                if row_comparison_status == False:
+                    comparison_status = row_comparison_status
+            else:
+                self.logger.info("Comparison of the number of rows between Harmonized and Scoring file skipped!")
+        return comparison_status
+
+
+    def df_iterator(self, data_file: str):
+        ''' Set up a pandas dataframe iterator. '''
+        df = pd.read_csv(data_file,
+                         sep=self.sep,
+                         dtype=str,
+                         comment='#',
+                         chunksize=1000000)
+        return df
+
+
+    def check_file_is_square(self, csv_file: str):
+        ''' Check that each row has the same number of columns. '''
+        square = True
+        csv_file.seek(0)
+        reader = csv.reader(csv_file, delimiter=self.sep)
+        count = 1
+        for row in reader:
+            if len(row) != 0:
+                if row[0].startswith('#'):
+                    self.comment_lines_count += 1
+                    continue
+                if (len(row) != len(self.header)):
+                    self.logger.error("Length of row {c} is: {l} instead of {h}".format(c=count, l=str(len(row)), h=str(len(self.header))))
+                    self.logger.error("ROW: "+str(row))
+                    square = False
+            count += 1
+        del csv_file
+        return square
+
+
+    def open_file_and_check_for_squareness(self):
+        ''' Method to read the file in order to check that each row has the same number of columns. '''
+        if pathlib.Path(self.file).suffix in [".gz", ".gzip"]:
+            with gzip.open(self.file, 'rt') as f:
+                return self.check_file_is_square(f)
+        else:
+            with open(self.file) as f:
+                return self.check_file_is_square(f)
+
+
+    def check_leading_trailing_spaces(self, cols:str, line_number:str = None):
+        '''
+        Check if the columns have leading and/or trailing spaces.
+        The leading/trailing spaces should raise an error during the validation.
+ ''' + leading_trailing_spaces = [] + found_trailing_spaces = False + for idx, col in enumerate(cols): + if col.startswith(' ') or col.endswith(' '): + leading_trailing_spaces.append(self.header[idx]+' => |'+str(col)+'|') + if len(leading_trailing_spaces): + if line_number: + line_name = f'line {line_number} has' + else: + line_name = 'following headers have' + self.logger.error("The "+line_name+" leading and/or trailing spaces: "+' ; '.join(leading_trailing_spaces)) + found_trailing_spaces = True + return found_trailing_spaces + + + def check_ext(self, ext:str) -> bool: + if self.file.endswith(ext): + return True + return False + + + def check_build_is_legit(self, build:str) -> bool: + if build in BUILD_LIST: + return True + return False + + + def get_comments_info(self, type:str) -> str: + ''' Retrieve information from the comments ''' + with gzip.open(self.file, 'rb') as f_in: + for f_line in f_in: + line = f_line.decode() + # Update header + if line.startswith(type): + info = (line.split('='))[1] + return info.strip() + + + def validate_filename(self): + ''' Validate the file name structure. ''' + print("To be implemented in inherited classes") + pass + + + def validate_headers(self): + ''' Validate the list of column names. ''' + print("To be implemented in inherited classes") + pass + + + def validate_line_content(self, cols_content:str, var_line_number:int): + ''' Validate each data row. ''' + print("To be implemented in inherited classes") + pass + + + def extract_specific_metadata(self, line:str): + ''' Extra method to extract and validate specific data. ''' + print("To be implemented in inherited classes") + pass + From 08409aafdf47ece789830ea26e2b88f34e5befcf Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 6 Sep 2022 11:27:46 +0100 Subject: [PATCH 02/46] Python 3.10 compatibility changes and minor updates --- pgscatalog_utils/validate/formatted/validator.py | 6 +++--- pgscatalog_utils/validate/harmonized_position/validator.py | 2 +- pgscatalog_utils/validate/validator_base.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py index 0a5ed3b..16bd425 100644 --- a/pgscatalog_utils/validate/formatted/validator.py +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -20,6 +20,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.validators = FORMATTED_VALIDATORS self.valid_cols = VALID_COLS_FORMATTED self.valid_type = VALID_TYPE_FORMATTED + self.setup_field_validation() def extract_specific_metadata(self,line): @@ -44,7 +45,7 @@ def get_and_check_variants_number(self): self.variants_number = int(match_variants_number.group(1)) else: variant_lines += 1 - if re.search('\w+', line): # Line not empty + if re.search(r'\w+', line): # Line not empty cols = line.split(self.sep) has_trailing_spaces = self.check_leading_trailing_spaces(cols,line_number) if has_trailing_spaces: @@ -128,7 +129,7 @@ def validate_data(self): def validate_filename(self): filename = self.file.split('/')[-1].split('.')[0] - if re.match('^PGS\d{6}$', filename): + if re.match(r'^PGS\d{6}$', filename): return True else: self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where the 'X' are the 6 digits of the PGS identifier (e.g. 
PGS000001)".format(filename)) @@ -136,7 +137,6 @@ def validate_filename(self): def validate_headers(self): - self.setup_field_validation() self.detect_genomebuild_with_rsid() required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header) if not required_is_subset: diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py index c12ca58..3058da3 100644 --- a/pgscatalog_utils/validate/harmonized_position/validator.py +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -16,6 +16,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.validators = POS_VALIDATORS self.valid_cols = VALID_COLS_POS self.valid_type = VALID_TYPE_POS + self.setup_field_validation() def extract_specific_metadata(self,line): @@ -65,7 +66,6 @@ def validate_filename(self): def validate_headers(self): ''' Validate the list of column names. ''' # Check if it has at least a "SNP" column or a "chromosome" column - self.setup_field_validation() required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header) if not required_is_subset: self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_POS, self.header)) diff --git a/pgscatalog_utils/validate/validator_base.py b/pgscatalog_utils/validate/validator_base.py index f76adf1..6f9bdc8 100644 --- a/pgscatalog_utils/validate/validator_base.py +++ b/pgscatalog_utils/validate/validator_base.py @@ -113,7 +113,7 @@ def validate_content(self): # Check data else: variant_lines_count += 1 - if re.search('\w+', line): # Line not empty + if re.search(r'\w+', line): # Line not empty cols_content = line.split(self.sep) has_trailing_spaces = self.check_leading_trailing_spaces(cols_content,line_number) if has_trailing_spaces: From e2fc20913fcb2b237df9be43e1f988c69985f18e Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Wed, 14 Sep 2022 13:22:03 +0100 Subject: [PATCH 03/46] Code updates and add tests for validation --- .../validate/formatted/validator.py | 81 ++++------ .../validate/harmonized_position/validator.py | 73 ++------- .../validate/validate_scorefile.py | 85 +++++----- pgscatalog_utils/validate/validator_base.py | 73 ++++++++- tests/data/test_scoring_file_1.txt.gz | Bin 0 -> 1071 bytes tests/data/test_scoring_file_2.txt.gz | Bin 0 -> 877 bytes tests/data/test_scoring_file_3.txt.gz | Bin 0 -> 876 bytes tests/data/test_scoring_file_4.txt.gz | Bin 0 -> 1076 bytes .../data/test_scoring_file_hmpos_37_1.txt.gz | Bin 0 -> 1257 bytes .../data/test_scoring_file_hmpos_37_2.txt.gz | Bin 0 -> 1157 bytes .../data/test_scoring_file_hmpos_37_3.txt.gz | Bin 0 -> 973 bytes .../data/test_scoring_file_hmpos_38_1.txt.gz | Bin 0 -> 1335 bytes .../data/test_scoring_file_hmpos_38_2.txt.gz | Bin 0 -> 1163 bytes .../data/test_scoring_file_hmpos_38_3.txt.gz | Bin 0 -> 975 bytes tests/test_validate.py | 148 ++++++++++++++++++ 15 files changed, 293 insertions(+), 167 deletions(-) create mode 100644 tests/data/test_scoring_file_1.txt.gz create mode 100644 tests/data/test_scoring_file_2.txt.gz create mode 100644 tests/data/test_scoring_file_3.txt.gz create mode 100644 tests/data/test_scoring_file_4.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_37_1.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_37_2.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_37_3.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_38_1.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_38_2.txt.gz create mode 100644 
tests/data/test_scoring_file_hmpos_38_3.txt.gz create mode 100644 tests/test_validate.py diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py index 16bd425..eda02cc 100644 --- a/pgscatalog_utils/validate/formatted/validator.py +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -81,7 +81,9 @@ def detect_duplicated_rows(self,dataframe_chunk): self.bad_rows.append(index) - def validate_data(self): + def validate_data(self) -> bool: + ''' Validate the file: data format and data content ''' + self.logger.info("Validating data...") if not self.open_file_and_check_for_squareness(): self.logger.error("Please fix the table. Some rows have different numbers of columns to the header") self.logger.info("Rows with different numbers of columns to the header are not validated") @@ -119,24 +121,33 @@ def validate_data(self): if len(self.bad_rows) >= self.error_limit: break if not self.bad_rows and not self.global_errors: - self.logger.info("File is valid") - return True - + if self.is_file_valid(): + self.logger.info("File is valid") + else: + self.logger.info("File is invalid") else: self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit)) - return False + self.set_file_is_invalid() + return self.is_file_valid() - def validate_filename(self): + def validate_filename(self) -> bool: + ''' Validate the file name structure. ''' + self.logger.info("Validating file name...") filename = self.file.split('/')[-1].split('.')[0] - if re.match(r'^PGS\d{6}$', filename): - return True - else: + is_valid_filename = True + if not re.match(r'^PGS\d{6}$', filename): + self.logger.info("Invalid filename: {}".format(self.file)) self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where the 'X' are the 6 digits of the PGS identifier (e.g. PGS000001)".format(filename)) - return False + is_valid_filename = False + self.set_file_is_invalid() + + return is_valid_filename - def validate_headers(self): + def validate_headers(self) -> bool: + ''' Validate the list of column names. 
''' + self.logger.info("Validating headers...") self.detect_genomebuild_with_rsid() required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header) if not required_is_subset: @@ -157,6 +168,10 @@ def validate_headers(self): self.logger.error("Required headers: at least one of the columns '{}' must be in the file header: {}".format(STD_COLS_EFFECT_FORMATTED, self.header)) required_is_subset = None + if not required_is_subset: + self.logger.info("Invalid headers...exiting before any further checks") + self.set_file_is_invalid() + return required_is_subset @@ -185,46 +200,4 @@ def get_genomebuild(self): def init_validator(file, logfile, score_dir=None) -> ValidatorFormatted: validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) - return validator - -# def run_validator(file, check_filename, logfile, score_dir=None): - -# validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) - -# validator.logger.propagate = False - -# if not file or not logfile: -# validator.logger.info("Missing file and/or logfile") -# validator.logger.info("Exiting before any further checks") -# sys.exit() -# if not os.path.exists(file): -# validator.logger.info("Error: the file '"+file+"' can't be found") -# validator.logger.info("Exiting before any further checks") -# sys.exit() - -# is_ok_to_run_validation = 1 -# validator.logger.info("Validating file extension...") -# if not validator.validate_file_extension(): -# validator.logger.info("Invalid file extension: {}".format(file)) -# validator.logger.info("Exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation and check_filename: -# validator.logger.info("Validating file name...") -# if not validator.validate_filename(): -# validator.logger.info("Invalid filename: {}".format(file)) -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating headers...") -# if not validator.validate_headers(): -# validator.logger.info("Invalid headers...exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating data...") -# validator.validate_data() - -# # Close log handler -# validator.logger.removeHandler(validator.handler) -# validator.handler.close() \ No newline at end of file + return validator \ No newline at end of file diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py index 3058da3..b46e8c4 100644 --- a/pgscatalog_utils/validate/harmonized_position/validator.py +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -43,28 +43,33 @@ def validate_line_content(self,cols_content,var_line_number): self.logger.error(f"- Variant line {var_line_number} | 'hm_match_pos' should be 'True': same position ('chr_position={line_dict['chr_position']}' vs 'hm_pos={line_dict['hm_pos']}')") - def validate_filename(self): + def validate_filename(self) -> bool: ''' Validate the file name structure. 
''' + self.logger.info("Validating file name...") pgs_id, build = None, None + is_valid_filename = True # hmPOS filename = self.file.split('/')[-1].split('.')[0] filename_parts = filename.split('_hmPOS_') if len(filename_parts) != 2: self.logger.error("Filename: {} should follow the pattern _hmPOS_.txt.gz [build=GRChXX]".format(filename)) - return False + self.set_file_is_invalid() + is_valid_filename = False else: pgs_id, build = filename_parts self.file_pgs_id = pgs_id self.file_genomebuild = build if not self.check_build_is_legit(build): self.logger.error("Build: {} is not an accepted build value".format(build)) - return False - self.logger.info("Filename looks good!") - return True + self.set_file_is_invalid() + is_valid_filename = False + return is_valid_filename - def validate_headers(self): + + def validate_headers(self) -> bool: ''' Validate the list of column names. ''' + self.logger.info("Validating headers...") # Check if it has at least a "SNP" column or a "chromosome" column required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header) if not required_is_subset: @@ -78,7 +83,11 @@ def validate_headers(self): if not required_pos: self.logger.error("One of the following required header is missing: '{}' and/or '{}' are not in the file header: {}".format(SNP_COLS_VAR_POS, CHR_COLS_VAR_POS, self.header)) required_is_subset = required_pos - + + if not required_is_subset: + self.logger.info("Invalid headers...exiting before any further checks") + self.set_file_is_invalid() + return required_is_subset @@ -86,52 +95,4 @@ def validate_headers(self): def init_validator(file, logfile, score_dir=None) -> ValidatorPos: validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) - return validator - -# def run_validator(file, check_filename, logfile, score_dir=None): - -# validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) - -# validator.logger.propagate = False - -# if not file or not logfile: -# validator.logger.info("Missing file and/or logfile") -# validator.logger.info("Exiting before any further checks") -# sys.exit() -# if not os.path.exists(file): -# validator.logger.info("Error: the file '"+file+"' can't be found") -# validator.logger.info("Exiting before any further checks") -# sys.exit() - -# is_ok_to_run_validation = 1 -# validator.logger.info("Validating file extension...") -# if not validator.validate_file_extension(): -# validator.logger.info("Invalid file extension: {}".format(file)) -# validator.logger.info("Exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation and check_filename: -# validator.logger.info("Validating file name...") -# if not validator.validate_filename(): -# validator.logger.info("Invalid filename: {}".format(file)) -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Comparing filename with metadata...") -# if not validator.compare_with_filename(): -# validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating headers...") -# if not validator.validate_headers(): -# validator.logger.info("Invalid headers...exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating data...") -# validator.validate_data() - -# # Close log handler -# validator.logger.removeHandler(validator.handler) -# validator.handler.close() \ No newline at end 
of file + return validator \ No newline at end of file diff --git a/pgscatalog_utils/validate/validate_scorefile.py b/pgscatalog_utils/validate/validate_scorefile.py index f31ef88..3e38bf4 100644 --- a/pgscatalog_utils/validate/validate_scorefile.py +++ b/pgscatalog_utils/validate/validate_scorefile.py @@ -38,55 +38,42 @@ def _file_validation_state(filename: str, log_file: str) -> None: def _run_validator(validator: object, file: str, check_filename: bool, logfile: str, validator_type: str) -> None: ''' Main method to run the PGS file validator ''' - validator.logger.propagate = False - - is_ok_to_continue_validation = 1 - - # Check files exist - if not file or not logfile: - validator.logger.info("Missing file and/or logfile") - is_ok_to_continue_validation = 0 - elif file and not os.path.exists(file): - validator.logger.info("Error: the file '"+file+"' can't be found") - is_ok_to_continue_validation = 0 - - # Validate file extension - validator.logger.info("Validating file extension...") - if not validator.validate_file_extension(): - validator.logger.info("Invalid file extension: {}".format(file)) - is_ok_to_continue_validation = 0 - # Validate file name nomenclature - if is_ok_to_continue_validation and check_filename: - validator.logger.info("Validating file name...") - if not validator.validate_filename(): - validator.logger.info("Invalid filename: {}".format(file)) - is_ok_to_continue_validation = 0 - - # Only for harmonized files - if is_ok_to_continue_validation and validator_type != 'formatted': - validator.logger.info("Comparing filename with metadata...") - if not validator.compare_with_filename(): - validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) - is_ok_to_continue_validation = 0 - - # Validate column headers - if is_ok_to_continue_validation: - validator.logger.info("Validating headers...") - if not validator.validate_headers(): - validator.logger.info("Invalid headers...exiting before any further checks") - is_ok_to_continue_validation = 0 - - # Validate data content - if is_ok_to_continue_validation: - validator.logger.info("Validating data...") - validator.validate_data() - - if is_ok_to_continue_validation == 0: - validator.logger.info("Exiting before any further checks") - - # Close log handler - validator.logger.removeHandler(validator.handler) - validator.handler.close() + if check_filename: + validator.run_validator() + else: + validator.run_validator_skip_check_filename() + # validator.logger.propagate = False + + # # Check files exist + # if not file or not logfile: + # validator.logger.info("Missing file and/or logfile") + # validator.set_file_is_invalid() + # elif file and not os.path.exists(file): + # validator.logger.info("Error: the file '"+file+"' can't be found") + # validator.set_file_is_invalid() + + # # Validate file extension + # validator.validate_file_extension() + + # # Validate file name nomenclature + # if validator.is_file_valid() and check_filename: + # validator.validate_filename() + + # # Only for harmonized files + # if validator.is_file_valid() and validator_type != 'formatted': + # validator.compare_with_filename() + + # # Validate column headers + # if validator.is_file_valid(): + # validator.validate_headers() + + # # Validate data content + # if validator.is_file_valid(): + # validator.validate_data() + + # # Close log handler + # validator.logger.removeHandler(validator.handler) + # validator.handler.close() def _check_args(args): diff --git a/pgscatalog_utils/validate/validator_base.py 
b/pgscatalog_utils/validate/validator_base.py index 6f9bdc8..80af5c4 100644 --- a/pgscatalog_utils/validate/validator_base.py +++ b/pgscatalog_utils/validate/validator_base.py @@ -42,6 +42,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.errors_seen = {} self.logfile = logfile self.error_limit = int(error_limit) + self.is_valid = True # Logging variables self.logger = logging.getLogger(__name__) @@ -135,8 +136,9 @@ def validate_content(self): self.global_errors += 1 - def validate_data(self): + def validate_data(self) -> bool: ''' Validate the file: data format and data content ''' + self.logger.info("Validating data...") if not self.open_file_and_check_for_squareness(): self.logger.error("Please fix the table. Some rows have different numbers of columns to the header") self.logger.info("Rows with different numbers of columns to the header are not validated") @@ -156,12 +158,21 @@ def validate_data(self): if len(self.bad_rows) >= self.error_limit: break - if not self.bad_rows and not self.global_errors: + if not self.bad_rows and not self.global_errors and self.is_valid: self.logger.info("File is valid") - return True else: self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit)) - return False + self.set_file_is_invalid() + return self.is_valid + + + def is_file_valid(self) -> bool: + ''' Method returning the boolean value: True if the file is valid, False if the file is invalid. ''' + return self.is_valid + + def set_file_is_invalid(self): + ''' Set the flag "is_valid" to False. ''' + self.is_valid = False def process_errors(self): @@ -196,14 +207,16 @@ def store_errors(self, errors: List[pandas_schema.validation_warning.ValidationW def validate_file_extension(self): ''' Check/validate the file name extension. ''' + self.logger.info("Validating file extension...") check_exts = [self.check_ext(ext) for ext in self.valid_extensions] if not any(check_exts): self.valid_ext = False + self.set_file_is_invalid() + self.logger.info("Invalid file extension: {}".format(self.file)) self.logger.error("File extension should be in {}".format(self.valid_extensions)) - return False else: self.valid_ext = True - return True + return self.valid_ext def compare_number_of_rows(self): @@ -234,6 +247,7 @@ def compare_number_of_rows(self): def compare_with_filename(self): ''' Check that the filename matches the information present in the file metadata (PGS ID, genome build). 
''' + self.logger.info("Comparing filename with metadata...") comparison_status = True if hasattr(self,'file_genomebuild') and hasattr(self,'file_pgs_id'): # Extract some metadata @@ -242,10 +256,10 @@ def compare_with_filename(self): # Compare metadata with filename information if self.file_genomebuild != self.genomebuild: self.logger.error("Build: the genome build in the HmPOS_build header ({}) is different from the one on the filename ({})".format(self.genomebuild,self.file_genomebuild)) - check_status = False + comparison_status = False if self.file_pgs_id != self.pgs_id: self.logger.error("ID: the PGS ID of the header ({}) is different from the one on the filename ({})".format(self.pgs_id,self.file_pgs_id)) - check_status = False + comparison_status = False # Compare number of rows with Scoring file if self.score_dir: row_comparison_status = self.compare_number_of_rows() @@ -253,6 +267,9 @@ def compare_with_filename(self): comparison_status = row_comparison_status else: self.logger.info("Comparison of the number of rows between Harmonized and Scoring file skipped!") + if not comparison_status: + self.logger.info("Discrepancies between filename information and metadata: {}".format(self.file)) + self.set_file_is_invalid() return comparison_status @@ -338,6 +355,46 @@ def get_comments_info(self, type:str) -> str: info = (line.split('='))[1] return info.strip() + def run_generic_validator(self,check_filename): + self.logger.propagate = False + + # Check files exist + if not self.file or not self.logfile: + self.logger.info("Missing file and/or logfile") + self.set_file_is_invalid() + elif self.file and not os.path.exists(self.file): + self.logger.info("Error: the file '"+self.file+"' can't be found") + self.set_file_is_invalid() + + # Validate file extension + self.validate_file_extension() + + # Validate file name nomenclature + if self.is_file_valid() and check_filename: + self.validate_filename() + + # Only for harmonized files + if self.is_file_valid() and type(self).__name__ != 'ValidatorFormatted': + self.compare_with_filename() + + # Validate column headers + if self.is_file_valid(): + self.validate_headers() + + # Validate data content + if self.is_file_valid(): + self.validate_data() + + # Close log handler + self.logger.removeHandler(self.handler) + self.handler.close() + + def run_validator(self): + self.run_generic_validator(True) + + def run_validator_skip_check_filename(self): + self.run_generic_validator(False) + def validate_filename(self): ''' Validate the file name structure. ''' diff --git a/tests/data/test_scoring_file_1.txt.gz b/tests/data/test_scoring_file_1.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..cd46417a56e0f016489606f5d512f708d861094f GIT binary patch literal 1071 zcmV+~1kn2*iwFo01`=Zc19W9`bYF90Z*pmFXJ2M%Y-L|DE_8Tw0BuxFZ=*;MJ>y@2 zvb$kS$ihQ9`=MtAP$maEKDYvf4`3gJkn}?!O)LauU@@kKA(Tz zJf^GLr`!AeCVgD(AGW(qdbhn_r;GGbRq0#ToiE>RZl>$!=Vx|4TwaQ6*SaZCaFN-PO&hIHvUp58jLZZZ;qO=0B_Ug8LxG@Qm+ha)%Uc`WiGR+`B@QF0USF%1E-#M9>R2UB_g1yj&3H8Q2@a!#t(Jrm zQqFR4$trnDI1?)&xN4ycK*VI2Lg-O!0(nVplTEUajEoBu0PFRLC9k~hQC0=A4w&1- zWX6yVYT_ajm?d&qY#?Mp^BEVy(ysfoql6%zBNZkxN^n$k|>! 
zL$D~06~QG+#D|fK*gEkpM3@-uDasF-C)Bs2dUlW>d(>z`h@%tW3dByPlB5EnGQfzH zQ68X8EE%+u$uZ-MH{SLr_S&)^45Q3y%#%l9TC`Xnxtv2=Hp;gdXrcLcMI@eSKm}Gd zT0xod;-dF1dXE#bv7!?Wfx!e@9mup|^LvsIU`Ch?Hh}C2Rw!?@^`jL;SO#$s1dlvT zwi-fF8Bq@H2G{FCIN_N@s@@6hm8h!|)>twkRMkHT0|mn-K<-X6GUmn4Dtgr-IgL1!A?GnOP-PsODWpW`mPGX^AFzHpRy06)Z?7FPx>DqUa)g*z6D!)xIS&aV3K`cX z`rrr)9)^A8%vi=4CL0hFc+`We;8Srmt6y=t?hc9Z+^fP;AUrbe4tXzX3Lh)gD**lLn`JL)m40EBU(?O@Y4`HiV!zpKCmD;#t8&f&^uOWeQk=tl{l_Z*vCp5D2!QAr zYMY{J^Cn#DrVZtMTU^ufvzJyDzao4tu2%*x{|kmgJzqWTa_;0Ex=D5q)$yav+h5l( ze+z9f$&TStpF{rkS)I!H`epf{Xp;RbnyR?8tnBmr7Mi&-Z0~jV_seqiUq9Eu&-du( zy=waz&7X^o+NQ5*Xw%{((rwYE>u^cGNRaqdq-9-wLl6kc&2Acvb5jPU0OZuvOq$!~ z&mD-nHXW5;){`|j0UuZ z`kKrmAaZ`>p){799mIIy&?+#a7HL@w>+C!*&!e;yiz0YlqVL44`=VZ1iVn5aPCGLg zL}3(9RgMi6DKgIYVvGQmCe81foHmKQ2^89q?dM|qxp)a^x|(d>GEsrlzbDEqv|-|& zO@PzNxX}b6M~!SWuaV2rh*_5GE%vm|gK|Ld@L;q-Jq+6mekF+h3@Y$8+SAhWrsF&v zl4n{sGu{J_UE|0iFxJUl2nyvLX&Lofr@SP|EEXw+G`t5JH>)KRhh|D>%VuL(BoODw z_cUf%kk-YL#wy-PKT6gpJ?P$ASJ7+bjeh87&%Cb@(;Rm*#Fm9vX~;g-qqqam!!~(> z7$e>{XKvUIchgqtxNt;Pj0Am6&6soMp;GdA09{|x2y2~D5#sv DlbF69 literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_3.txt.gz b/tests/data/test_scoring_file_3.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..6a2fef35d5b7605e6d95e12e976947db0f44bb4e GIT binary patch literal 876 zcmV-y1C#t8iwFoh2ohre19W9`bYF90Z*pmFXJ2M%Y-L|FE_8Tw0Buvjj^a2FJ%_J| z#AW7yvCA%(JrV~N(Fmy*X1m$Lo}x4{5D7>W($kv1-@?QkR?7v#-S5*JVD*Zz1HTzHe_orqlHP_4Os%<85E|<+(YEraMln=IeTH z%IY>9SLfo;x5aJWboKR^-t$A#hG z>p!dIX1$1(miae!`8{86Ubb(4&UfqWW;A9KMP1Dpkn#2S=2~9DZ1u;o_-j|ZER>W= zd4~F~tox!1ZPWFknr+H9uRf!-s{EbcOWC#zUjG*ihi0~X*%my>JLN{>uTURPebN7J z!|WsUj+=c{ndzkvkxUF20$|4>pWsonHu6goFyU?L?sJ;Blb z)Wu@5`-ir<#XV%kW3@D_x5y?+Nb=4p#x!G~r81n}L`FLmop|9C>*HWVOQ%Ue^v^V}z*3Ub3Z#*akyVuR+ENO~ zlQn{Sgi``FHw?o#Z7uu4XaxmJD8NLs)`P?tKndf>#!7622)qa8aBj;qE#c|J5_4IUhyZJ%DzSQq2a)akY<{y+H0&rf^1EW?~9wz@w%i z9Ao6+Q5dhlA{muvD0z)0T2VrKM+OpPl_ag1#V$brvT2(XiGb8_CtC1N;dv*FX|Oi( zm}VtdH%RgYX`UUcWRt*?W4OB{$3_V{1>n@yxF=@F0zEbk!UB>_>p|^2c=ZfUo@6>A zWD=i8+z*d47ED?=I?t!5qoiTTIZQ6C;*Fq*iYTmgJmLvCBmLuIK^O$CXb|L90qB|S ze%dMx;|tQ#({NqNIg)9omLziNOo~lfgM5Yq#{Ugov5iK6@0@s75&9Ri=dFC{1polF CFTP*^ literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_4.txt.gz b/tests/data/test_scoring_file_4.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..7e57cfe06391dc2d1ce5f6f7ae5c703c0a6c29b0 GIT binary patch literal 1076 zcmV-41k3v$iwFqg2ohre19W9`bYF90Z*pmFXJ2M%Y-L|GE_8Tw0BuxFZ=*;MJ>y@2 zvb$kS$j6g9`=MtAP$maEKDXl|9&0~c%{|)f}tO;UcGw7d_Moa zc}!QgPq+8`P5QXnKWulK^lp2FC9C8q;Y_TA;Hrf(01=a63ZX}_3FIZYO*Y9wGBPeu0Ib&|mb~)1M_CofI$&-S zlNm!gsELbAV3x>bjnm4Y-YQv8rX+YzfN8+k8)dZvinSsm1g=$LGwU^uMlNlQB4>N` z48fu}Rs@$Q5g$e_V(Y}a5Mg4przk&Uo>1S8>e)el>`|i$A&yRfD-b&wO8!$4_x)*N zWt7MHCRPjD$z+ys#v5;Y6nkyi4_;AbHAcy!CM{a5kJX$*M>Z}nJ`QuRquuNO4QX0Yb==_#o{sFVb&qTLs64smb40eL8BeW z>|~hXfM`xb^dw=Rf_OwIjt7Cj{0)svqO9}WW2{2tOfNl!Upja(JQ?A<4lEJoNQu^< zLWi8OMZ<1Npn=~}l?V%sk=GgC;cMhs3HBlkD9z4ADlj?4&QArc(})N_SPKocaf)b= zAu)Q@*9%4mEJTk&C>3%x+5kHdsS#zc{4wxW8*wZ{NI7$C#a(j)CHPE4ln)%}>Zk-! 
z(^?>=*rq-<7DPySj15#7$7aeY5xON&J<10xpN{m0olo{Tr+#cCF>C#$FVau(mu7t?qUSH{Hqx;|HTx3`z)+uPglysNfpogRwa zuqb!8S@Cu{6lqr7_OnCsQlFD*Ta@`}cX`GyMH#1Ame+Z4N)K^$ybYT4^U5J{em06^}qSvjRgGn##y zCVy^{`w2z}V+~MUrg@!|=3JDu$wtfc9A~dCv@HE~fsg6=4B+Wg(7Y6*>HR8!owUNL z*MBqlZeJ($ud^9Fn>y|Fcji)IwX4x+k=n1;@j5njoF0bpGOgp8ImKTHBKYet&WijFqaj9A-Bt{9 zr3L^D5R!@xIBfW1{PeI|&i`?H#owo!@qDrF^?w{6SL;r|&03b$W(4YFfaHLv%QAFo zyHB?Ja`gLVQ@)y;yO^Rs;+y;QP}vvsug^Re)&DcgYW_U}E!Q;n0qD*nk$HJB+d4@P z2Xin{QSVK8d3lGL@<;n)QWdY|*4R%@5S}1qAE0I*;5JjPti-rcfVGc2t}nobh!~QD zQyJk1Bf*Il(InbLXb24)8%^l@NE%&MviQ%?jLAjf*HYDTbwYBm5j#GEE>om zWHu#oMk5ycRv0$@TBW)eeSlO{0E6$Q8@7Lq+J${E6f+nU;XOGI3jO=J>DsoH6N z!tQi{3#o*5xMUR4vK`Pvg&_?dG$3I6c?zhi>koz6N=%$LTxbYuOoa3j4cLBYNW&m$ za7++G3bbYyHL0nPZL57k_vDyhiYabkq9`Y-v6?Fzr!W{u+qVJP?}4O}E4P&p&@X;9 z8i$>sOzjo~p{rkr%+prlGuo-Xpj1g_8 z;Z-W&`t?#^^JXKcCrp^@sO20EHS=arK=zG+xKPBR98nDe>cg=x7nNjf_Xk`*Dxs(* z(%}MTA*B%3`v4+@L#sT)hHQUiFz~RefRsey_X~gyflPu6ARzlvNW$fy8<-LyTwKk~ zLE93L;zcl?cM6XjdyB`x!#g@bkyPN)wISK9yRc_B9fc{jgxF!Cn@IePGbL zq0X T-uw#KEw%pu^t^ev3Jd@MqV`|+ literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_hmpos_37_2.txt.gz b/tests/data/test_scoring_file_hmpos_37_2.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..8acaa73155ffa989b728af49308825bbcaeb26ab GIT binary patch literal 1157 zcmV;01bX`)iwFqQ5Eo+r19W9`bYF90Z*pmFXJ2M%Y-L|)ZE$aMUo$sfGA?v@bO3c# zO>g2z5Ir})q7^PP(gOYc5y=OJAq*0MY+w(&7t4b?fCL!X&P1ct{`al!wlj9L%f%nh zudC|St714DJ}tI*Hr-8En+4v^HqXoT0zWKQb3DS8bNE=-x9WZ}X|Hc@Z{ypc+Gllk zE)L_OJWTT9?RqY(rRCnr|Y4;;+LYtS)QNjQ*q7C_;iIAm&`v6hVeH2 z;mQi06E-FTNNThCb1`2pXCb9I{sdWm#>@4?=J|2DTW;2aAtaHW@&o`9e(}vUySQZW zWuE@NOCM&G5K1jTb(x*&v~;(ktX-b0vm4G|Luz^UvxP6&?FQiWKSB3WB=d(&3X!a# zW-xqnr^B&M>z_B5e0Oy=7#`fUxVZHD>*<^)i|5&q>%s6xR-UqJ4Vk@OzPmEvzP#<` zkLTI^V?DQ6&r@5^{!|Bzl2zuZHQr)Z#W8Lcg3F=Vf6DD=f&xsk})OROe#U8 zF(Uv*glNM@5(0jkK0j{O%fCV_{B^#YE?3*Z@Z05Sv;B~8e3xgnOW<@iB6`G3`x-j6 zKc@R*nf!gXD_`B+J?7*$eD{!@EB`?L>3?}Es{jA8td?JqdwFsDI?c{!cXsVl)D|5t zX;r+Id*|=M*Kmd4{sJfa3s}jOoBHU=X*dB6Wz=Y;4B8^qQaAK_yB7`AUQldMI76b+r!dR_5gEDIs zL({;ZN|-LA0euXvEjJu38je;xMuBWO`@r#_=SWK|lfa-vaV{l98F`9gaQ}W(l14DafhQzd167r0S5n(739a4)1l7Y;O&rDmvpc^5K(6*mg z14fONWGXg>Y02Q|CX5!UD_V;NlC(w}H1+imC4_G35!{$Arv-t-iDnQi1Y$&3>&pQO z*GkA9C15@Zg~naHA(r>bY2auyCOBz~ri_BK77a`BUOf%!ZDRaa z6F&@!SOS9{HK4Yqc8f9a0yR=g$-5ZFozSSKc= z2H#o9-Zu}0HOqRgYhX~&6D2o_M;e1{+YzTyk^UexFem|Q1{NbM;$7SfN|MG7;4>=v Xiy?qvD!@%J<9n=f1V=K literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_hmpos_37_3.txt.gz b/tests/data/test_scoring_file_hmpos_37_3.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..601865aa1206ab2ba7bfdcf55f289c77e2cb01b4 GIT binary patch literal 973 zcmV;;12X&{iwFo_1{Y%h19W9`bYF90Z*pmFXJ2M%Y-L|)ZE$aMUo$Rrcys`Ll}(T0 zI1q-<;a6Cl%Z#)be>-j|2bQ53BnEncJ?vhhG%*k<5R`;LqtX8NwVkF%?dhZs?bv=F zSCy;EgkktR+knY91m7fpy5$wwfq*<0Xd3jC?ke|)MDYdgf*nTG8byb^r zOduKt-p=8^VM4#I;zmnWUPT=OD}XY0w4%0c)&t@8BT7|*V!7hT2FhPPS$y`D_{ zcrI-`&-ZwCdE@RVUZfUnzy|0Bqy+@av;i}H2A>Eb_%j4qnLlC#F{06^W|&9O&}2=5#Vda!8{4vdPr$nCy;K z{PSU3UG>8wpy&_q@RSy{ouL2TD=%f;tu)8DdV}=rhdFEA=1@v^VO_8ldN9Kgaj<}#q8_)M z@Hbf9VbhI0x9x;JOer(hH3<~tO!Qf{LLcUcx;$fu73KzxI$$gGVI^A2=9#0s$FS9T ztPqH~kQo#a>&cW8dax7>IjX v$jmF$9kUa9uu|gw2niZR-LMnBhfP5e`dVNL^`xp5{(}Dp78*MM3&CTWd^77I@KAl#1 zo$t!0epx-;tjm|fuFTh`o9=p-J=VwUv?{CZ;puWsAImDu*XwP)Ef4uF-5xlw&;4De z+x*Rb7S)OG2iO-K7B#>7b26SyM?um!{m3dmrPJB%{NaAMoX%&RE{n*v>j58#@M|K6 zd|wPEU&h(*%j|XpLI5~Es;hikXH{`5tGZYZX8AE)KL=^+{MY4ppC6BWc=*r5;;|fz zZ|51m$yusSna6m{O|J{5;@Ut~X?x4ZRV@-W&+>U4kR)iyuWtn7LJ 
zqo@Ybr?;5@@h}>{&F9$U^Zk;~YFmdI4eoOPXq_(7qE7Q&Kb_@uIw=n6M-&L3`supd zeg@G8A??jcfws2C2R$Lk%AV#Iej7gA&u7!W!n@M1AA=cA?($!4FO%I9iT_Vq;4T+tIc;)vlAq!r{zNP`tt`k)pLv|)x}BSR?#(nSn0mLyT50+d6fK&zFJ zLU9_Sq~|F>kh2zI5fR#eaHu>7S|zkJ97HKV5wM6+BSLE{`8cA8!6bx4&mgrDV6?B0 zuO0~uc{D0W5ZO497;P1Rq^6$QkTouFkdiFi*VFYaa9a9ZIa{>VWSgF#;TnY+ORf|F zhOPz~0vw>Bt5jMv7y@O34B>dt&ruXgSZ0`9ILJvuu{B~i@))X6QDu}>R)?UK8f{~Z zP!27@S1oc^3zo(>6Bs0SSld8r3|C3F#?UN^(~k?(B`l#H-Xb3=)kA z8k+5mMxe1VK?DXK(q?aJBYBuP$a`1%x=mi!CURNgM#Na@{pSgsVrwd|i_N43%BYos zouFa~5g3(nCR9Q3M>;G)EfpEISi7sxpl0XBZh8fy<GK_aq1YRUpW{9wU^=fS9k~_v5-bbzDZ-s0PsZd#~?bTdH tXlb#j10a`QfaRzGo@cE>9*wDuv7Fr1erVCcvS-oCj002s9l0*Oi literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_hmpos_38_2.txt.gz b/tests/data/test_scoring_file_hmpos_38_2.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..8c1ec1853483ed1897a38b47b5afcb5559ba0f7f GIT binary patch literal 1163 zcmV;61a$i!iwFqQ5Eo+r19W9`bYF90Z*pmFXJ2M%Y-L|)ZE$aMUo$vgGA?v@bO41_ zO>g2z5Ivh;VTH?#v_MyXcO(uBLl`6inZO=)FP4KlfMhT-W}?w*|NB;V+lf6IX}S30 z`E^yjdR6rM{ipdRo=&!t<$4}(rt9a$Y92o2gH`n;JDC0ECj@7X^r+IulAAwpZl@Ppj+rZO zv;OmWI{USrOQ`3msb_bryhh_?>Znz`iES09`7mCkRXn%n_!A`*J`dxpINk#p0%3Y| zHM)vj9?y4lsbDP`d zDX1JBPRX^nl{@S1{MUHJG~5M`cNa93E!Wk-mJ@#h9!fzgDRs1s$dCe4nnttLX7>1H z8GV2L`75N7$y`58cm}C3Mv3Nv`7u=Nk)NYNMXbn&AAUljBqD?Fd7gUz%{zj zI0jH-B#kDX0g~%BqXu0J4j_q)<~2u27NTg)8U2OhK%Wh6Z zP)`xYpjlBAEi@xRF(MR-)>f+ zG(vaqox*Ayg-U|DcG}}n7e@m#-f5sD#jW7NI12h%cXF#yMYk{)R?mY0pgPMK|oUJ?gNF&&h7{pjhuXf z6P(e>$PI%ehT!-G4Zw8NZZL{oU=-&Rm%N)pg~BLV8ynp(JGwP6MCc@iL#su!^;?H> z&1(CAVnlSdjt51Aj^~;x?wjiV4iyPj0Oe++-P9UokrCKaVVrDeSV4_>&y93;TZ8fl zoRe@Km`@GAGlITv9>6tAJFcr?R1o(YjyoO`XgF288F3JjbO))1Q3;%70caN=4rgBq dQriK22E4l%Jm~k2Kp&ZC@*m3A;W(KGk-Y)^Pa6OL4K*mmEchPPH#iDHivSmo}1cO1}y+@=~vbz+0$3>OxyGz#Y>ZUv&h7s-S2IWOjc4d9eD^#9g;gt8Y(Kx(j-(Azf zJY|@SAgJ}zAB*{FIrAaS(I<%VGg_`5*Dv3u+vR#S8bc6SS){N*{1aBr`N^e=U+39x z+w5^h38B=&R@dZZmo@HEH=QfeRenLm)rVH(KZfm7e!0NrIlAfgbvl1sXK<1=)QrY& zt~?yOtow0s>8tDV(fHub^~q(g*Rm?o#mnr-^=SM(Z_51KL1fp{t7}r;r?*-E{xX}t z&*u{Ic^>lFm7T9q`jq#dc4&iKhw=)eRoFW=Uy<)8j6^ku%CE}u4|@z>Mydh_pq&9GMFol8NTO^BW_J1j$|cE@aY zY|@{1+ve)-9?0kfFFLnDrI0*XACH)KF-YALN@7=D;@~U!`OX}|E znqhL=3Oz`Ur>w27&Cd02O}qTkk08|#pwAa4Q5Odni#6MRvu+isbrB%6N!nG8c_WX)zsWj%_xA2gkhrWFFwUqIIS`}2`7r|vj)JhXXl_J82DzGvl zQY%Io3#e9t`11|6ghW&sV~PY6+IDk`lvaFn3uFA2m9|{@W>Kt!HZfGn;1)xLxRq54 z9Vshg5Mw|U`^T>ESwxjH;Z+rZs`*+;Oqq^&5YFemS&CAjy(-7OhXJ+kQLS`9H6K)& zU=d?6#pO*E%y|2F4DkfCl3W#b+NV-p$qw{T{3NXrOe xd;vHXa9=*g%0bq6)LJ!M7sZW Date: Wed, 14 Sep 2022 14:26:13 +0100 Subject: [PATCH 04/46] Catch ftp download errors --- .../download/download_scorefile.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index fc35529..f31c7ab 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -3,9 +3,11 @@ import os import shutil import textwrap +import time from contextlib import closing from functools import reduce from urllib import request as request +from urllib.error import HTTPError, URLError from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url @@ -62,14 +64,26 @@ def _mkdir(outdir: str) -> None: os.makedirs(outdir) -def _download_ftp(url: str, path: str) -> None: +def _download_ftp(url: str, path: str, retry:int = 0) -> None: if os.path.exists(path): logger.warning(f"File already exists at {path}, skipping download") return 
else: - with closing(request.urlopen(url)) as r: - with open(path, 'wb') as f: - shutil.copyfileobj(r, f) + try: + with closing(request.urlopen(url)) as r: + with open(path, 'wb') as f: + shutil.copyfileobj(r, f) + except (HTTPError, URLError) as error: + max_retries = 5 + print(f'Download failed: {error.reason}') + # Retry to download the file if the server is busy + if '421' in error.reason and retry < max_retries: + print(f'> Retry to download the file ... attempt {retry+1} out of {max_retries}.') + retry += 1 + time.sleep(10) + _download_ftp(url,path,retry) + else: + raise RuntimeError("Failed to download '{}'.\nError message: '{}'".format(url, error.reason)) def _check_args(args): From 5fe9fad79cd9c7d7e9c8b53407c2c209de77e33e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 15 Sep 2022 13:26:07 +0100 Subject: [PATCH 05/46] pin polars to 0.14.9 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b9899ab..b8262b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ pandas = "^1.4.3" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" -polars = "^0.14.9" +polars = "0.14.9" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From cd7ee7812236476321dcb1dbccc8147147651f21 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 15 Sep 2022 13:55:38 +0100 Subject: [PATCH 06/46] bump version for next release --- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index 10939f0..8ce9b36 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.2' +__version__ = '0.1.3' diff --git a/pyproject.toml b/pyproject.toml index b8262b2..65786fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.1.2" +version = "0.1.3" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert "] From 96896e36df7edd69068b1c57d5cb6e3381e7502d Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 16 Sep 2022 15:11:00 +0100 Subject: [PATCH 07/46] Handle PGS Catalog REST API errors and retries --- pgscatalog_utils/download/publication.py | 10 ++++---- pgscatalog_utils/download/score.py | 32 +++++++++++++++++++++--- pgscatalog_utils/download/trait.py | 10 ++++---- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py index 843b8a2..56c7f7b 100644 --- a/pgscatalog_utils/download/publication.py +++ b/pgscatalog_utils/download/publication.py @@ -1,20 +1,20 @@ import logging from functools import reduce -import requests +from pgscatalog_utils.download.score import query_api logger = logging.getLogger(__name__) def query_publication(pgp: str) -> list[str]: - api: str = f'https://www.pgscatalog.org/rest/publication/{pgp}' logger.debug("Querying PGS Catalog with publication PGP ID") - r: requests.models.Response = requests.get(api) + api: str = f'/publication/{pgp}' + results_json = query_api(api) - if r.json() == {}: + if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {pgp}") raise Exception - pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids') + pgs: dict[str, list[str]] = 
results_json.get('associated_pgs_ids') logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}") return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values())) diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index a38dc0c..edad470 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -3,6 +3,7 @@ import jq import requests +import time logger = logging.getLogger(__name__) @@ -28,11 +29,36 @@ def get_url(pgs: list[str], build: str) -> dict[str, str]: return dict(zip(pgs_result, url_result)) +def query_api(api: str, retry:int = 0) -> dict: + max_retries = 5 + wait = 60 + results_json = None + rest_url_root = 'https://www.pgscatalog.org/rest' + try: + r: requests.models.Response = requests.get(rest_url_root+api) + r.raise_for_status() + results_json = r.json() + except requests.exceptions.HTTPError as e: + print(f'HTTP Error: {e}') + if r.status_code in [421,429] and retry < 5: + retry +=1 + print(f'> Retry to query the PGS Catalog REST API in {wait}s ... attempt {retry} out of {max_retries}.') + time.sleep(wait) + results_json = query_api(api,retry) + except requests.exceptions.ConnectionError as e: + print(f'Error Connecting: {e}') + except requests.exceptions.Timeout as e: + print(f'Timeout Error: {e}') + except requests.exceptions.RequestException as e: + print(f'Request Error: {e}') + return results_json + + def query_score(pgs_id: list[str]) -> dict: pgs: str = ','.join(pgs_id) - api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}' - r: requests.models.Response = requests.get(api) - return r.json() + api: str = f'/score/search?pgs_ids={pgs}' + results_json = query_api(api) + return results_json def _chunker(pgs: list[str]): diff --git a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index c2db495..83af414 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -1,24 +1,24 @@ import logging from functools import reduce -import requests +from pgscatalog_utils.download.score import query_api logger = logging.getLogger(__name__) def query_trait(trait: str) -> list[str]: - api: str = f'https://www.pgscatalog.org/rest/trait/{trait}?include_children=1' logger.debug(f"Querying PGS Catalog with trait {trait}") - r: requests.models.Response = requests.get(api) + api: str = f'/trait/{trait}?include_children=1' + results_json = query_api(api) - if r.json() == {}: + if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}") raise Exception keys: list[str] = ['associated_pgs_ids', 'child_associated_pgs_ids'] pgs: list[str] = [] for key in keys: - pgs.append(r.json().get(key)) + pgs.append(results_json.get(key)) logger.debug(f"Valid response from PGS Catalog for EFO term: {trait}") return list(reduce(lambda x, y: set(x).union(set(y)), pgs)) From 50b9e9e1ee2b7242059e7865fd08e74a1ba460f4 Mon Sep 17 00:00:00 2001 From: Sam Lambert Date: Tue, 20 Sep 2022 11:16:23 +0100 Subject: [PATCH 08/46] Update README.md --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d19c186..7c897b8 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,8 @@ [![CI](https://github.com/PGScatalog/pgscatalog_utils/actions/workflows/main.yml/badge.svg)](https://github.com/PGScatalog/pgscatalog_utils/actions/workflows/main.yml) -This repository is a collection of useful tools for working with data from the -PGS Catalog. 
This is mostly used internally by the PGS Catalog calculator, but -other users might find some of these tools helpful. +This repository is a collection of useful tools for downloading and working with scoring files from the +PGS Catalog. This is mostly used internally by the PGS Catalog Calculator ([`PGScatalog/pgsc_calc`](https://github.com/PGScatalog/pgsc_calc)); however, other users may find some of these tools helpful. ## Overview @@ -66,4 +65,4 @@ doi:[10.1038/s41588-021-00783-5](https://doi.org/10.1038/s41588-021-00783-5). This work has received funding from EMBL-EBI core funds, the Baker Institute, the University of Cambridge, Health Data Research UK (HDRUK), and the European Union's Horizon 2020 research and innovation programme -under grant agreement No 101016775 INTERVENE. \ No newline at end of file +under grant agreement No 101016775 INTERVENE. From 268e96aef7f17ea4f72094bc7e0060a3c614ea96 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 14:53:20 +0100 Subject: [PATCH 09/46] add memory profiler to development dependencies --- poetry.lock | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 5 +- 2 files changed, 186 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index d776774..b8afbdd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -28,6 +28,17 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "cffi" +version = "1.15.1" +description = "Foreign Function Interface for Python calling C code." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pycparser = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -47,6 +58,24 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "contourpy" +version = "1.0.5" +description = "Python library for calculating contours of 2D quadrilateral grids" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16" + +[package.extras] +test-no-codebase = ["pillow", "matplotlib", "pytest"] +test-minimal = ["pytest"] +test = ["isort", "flake8", "pillow", "matplotlib", "pytest"] +docs = ["sphinx-rtd-theme", "sphinx", "docutils (<0.18)"] +bokeh = ["selenium", "bokeh"] + [[package]] name = "coverage" version = "6.4.4" @@ -61,6 +90,36 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cycler" +version = "0.11.0" +description = "Composable style cycles" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "fonttools" +version = "4.37.3" +description = "Tools to manipulate font files" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["scipy", "munkres"] +lxml = ["lxml (>=4.0,<5)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.23.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +ufo = ["fs (>=2.2.0,<3)"] +unicode = ["unicodedata2 (>=14.0.0)"] +woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] + [[package]] name = "idna" version = "3.3" @@ -85,6 +144,45 @@ category = "main" optional 
= false python-versions = ">=3.5" +[[package]] +name = "kiwisolver" +version = "1.4.4" +description = "A fast implementation of the Cassowary constraint solver" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "matplotlib" +version = "3.6.0" +description = "Python plotting package" +category = "dev" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.0.1" +numpy = ">=1.19" +packaging = ">=20.0" +pillow = ">=6.2.0" +pyparsing = ">=2.2.1" +python-dateutil = ">=2.7" +setuptools_scm = ">=7" + +[[package]] +name = "memory-profiler" +version = "0.60.0" +description = "A module for monitoring memory usage of a python program" +category = "dev" +optional = false +python-versions = ">=3.4" + +[package.dependencies] +psutil = "*" + [[package]] name = "numpy" version = "1.23.1" @@ -120,6 +218,18 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "pillow" +version = "9.2.0" +description = "Python Imaging Library (Fork)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pluggy" version = "1.0.0" @@ -149,6 +259,17 @@ xlsx2csv = ["xlsx2csv (>=0.8.0)"] pytz = ["pytz"] pyarrow = ["pyarrow (>=4.0)"] +[[package]] +name = "psutil" +version = "5.9.2" +description = "Cross-platform lib for process and system monitoring in Python." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"] + [[package]] name = "py" version = "1.11.0" @@ -157,6 +278,14 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "pyliftover" version = "0.4" @@ -257,6 +386,23 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "setuptools-scm" +version = "7.0.5" +description = "the blessed package to manage your versions by scm tags" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +packaging = ">=20.0" +tomli = ">=1.0.0" +typing-extensions = "*" + +[package.extras] +test = ["pytest (>=6.2)", "virtualenv (>20)"] +toml = ["setuptools (>=42)"] + [[package]] name = "six" version = "1.16.0" @@ -273,6 +419,14 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "typing-extensions" +version = "4.3.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "urllib3" version = "1.26.11" @@ -286,18 +440,36 @@ brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +[[package]] +name = "zstandard" +version = "0.18.0" +description = "Zstandard bindings for Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""} + +[package.extras] +cffi = ["cffi (>=1.11)"] + [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "607d2d543f52a4ecc116c0b912c499a83cd1c740244323c81fdfe89ba27a55eb" +content-hash = "a0d60a1fec35d248340f1640db49d07a7000b23e4bbe22426a9c240ee499c334" [metadata.files] atomicwrites = [] attrs = [] certifi = [] +cffi = [] charset-normalizer = [] colorama = [] +contourpy = [] coverage = [] +cycler = [] +fonttools = [] idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, @@ -307,6 +479,9 @@ iniconfig = [ {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] jq = [] +kiwisolver = [] +matplotlib = [] +memory-profiler = [] numpy = [] packaging = [] pandas = [ @@ -332,15 +507,18 @@ pandas = [ {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, ] +pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] polars = [] +psutil = [] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = 
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +pycparser = [] pyliftover = [ {file = "pyliftover-0.4.tar.gz", hash = "sha256:72bcfb7de907569b0eb75e86c817840365297d63ba43a961da394187e399da41"}, ] @@ -357,6 +535,7 @@ pytz = [ {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, ] requests = [] +setuptools-scm = [] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -365,4 +544,6 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +typing-extensions = [] urllib3 = [] +zstandard = [] diff --git a/pyproject.toml b/pyproject.toml index 65786fe..23caf20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,12 +18,15 @@ pandas = "^1.4.3" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" -polars = "0.14.9" +polars = "^0.14.9" +zstandard = "^0.18.0" [tool.poetry.dev-dependencies] pytest = "^7.1.2" pytest-cov = "^3.0.0" pysqlar = "^0.1.2" +memory-profiler = "^0.60.0" +matplotlib = "^3.6.0" [build-system] requires = ["poetry-core>=1.0.0"] From 012ff6de7dc076d5218073b7af4a48f522f6c135 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 14:57:57 +0100 Subject: [PATCH 10/46] add support for reading zstd compressed targets --- pgscatalog_utils/match/preprocess.py | 6 +- pgscatalog_utils/match/read.py | 98 +++++----------------------- pgscatalog_utils/target.py | 87 ++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 84 deletions(-) create mode 100644 pgscatalog_utils/target.py diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 1723f6d..4d93090 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -27,7 +27,7 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) -> pl.DataFrame: +def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format: str) -> pl.DataFrame: # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) @@ -35,10 +35,10 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if df['is_multiallelic'].sum() > 0: + if df.select('is_multiallelic').sum() > 0: logger.debug("Multiallelic variants detected") if remove_multiallelic: - if not pvar: + if file_format == "bim": logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " "variant representations only") logger.debug('Dropping multiallelic variants') diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index fd1a4c3..c25175a 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,104 +1,40 @@ import glob import logging -from typing import NamedTuple import polars as pl from pgscatalog_utils.match.preprocess import handle_multiallelic, 
complement_valid_alleles +from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, - chrom: str = "") -> pl.DataFrame: - target: Target = _detect_target_format(path) - d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string +def read_target(path: str, remove_multiallelic: bool) -> pl.DataFrame: + """ Read one or more targets from a path (may contain a wildcard) """ - if single_file: - logger.debug(f"Scanning target genome for chromosome {chrom}") - # scan target and filter to reduce memory usage on big files - df: pl.DataFrame = ( - pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) - .filter(pl.col('column_1') == chrom) - .collect()) - - if df.is_empty(): - logger.warning(f"Chromosome missing from target genome: {chrom}") - return df + if '*' in path: + logger.debug("Wildcard detected in target path: finding all matching files") + paths: list[str] = glob.glob(path) else: - logger.debug(f"Reading target {path}") - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) + logger.debug("") + paths: list[str] = [path] - df.columns = target.header + targets: list[Target] = [Target.from_path(x) for x in paths] + dfs: list[pl.DataFrame] = [] + for target in targets: + assert target.file_format in ['bim', 'pvar'] + dfs.append(target.read().pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, + file_format=target.file_format)) - match target.file_format: - case 'bim': - return (df.select(_default_cols()) - .filter(pl.col('ID') != '.') # remove missing IDs - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False)) - case 'pvar': - return (df.select(_default_cols()) - .filter(pl.col('ID') != '.') - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True)) - case _: - logger.error("Invalid file format detected") - raise Exception + return pl.concat(dfs).filter(pl.col("ID") != '.') def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) - .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) - .with_columns([ + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) + .with_columns([ pl.col('accession').cast(pl.Categorical), pl.col("effect_type").cast(pl.Categorical)])) return scorefile - - -class Target(NamedTuple): - """ Important summary information about a target genome. Cheap to compute (just reads the header). 
""" - file_format: str - header: list[str] - - -def _detect_target_format(path: str) -> Target: - file_format: str - header: list[str] - - if "*" in path: - logger.debug("Detecting target file format") - path = glob.glob(path)[0] # guess format from first file in directory - - with open(path, 'rt') as f: - for line in f: - if line.startswith('#'): - logger.debug("pvar format detected") - file_format = 'pvar' - header = _pvar_header(path) - break - else: - logger.debug("bim format detected") - file_format = 'bim' - header = _bim_header() - break - - return Target(file_format, header) - - -def _default_cols() -> list[str]: - return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] # only columns we want from a target genome - - -def _pvar_header(path: str) -> list[str]: - """ Get the column names from the pvar file (not constrained like bim, especially when converted from VCF) """ - line: str = '#' - with open(path, 'rt') as f: - while line.startswith('#'): - line: str = f.readline() - if line.startswith('#CHROM'): - return line.strip().split('\t') - - -def _bim_header() -> list[str]: - return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py new file mode 100644 index 0000000..6b28998 --- /dev/null +++ b/pgscatalog_utils/target.py @@ -0,0 +1,87 @@ +import zstandard +from dataclasses import dataclass +import io +import logging +import polars as pl + +logger = logging.getLogger(__name__) + + +@dataclass +class Target: + """ Class to detect and read a plink1/plink2 variant information file """ + file_format: str = None + header: list[str] = None + path: str = None + compressed: bool = False + + @classmethod + def from_path(cls, path): + """ Create a Target object from a path. Cheaply detect file format and headers. """ + try: + with open(path, 'r') as f: + file_format, header = _get_header(f) + compressed = False + except UnicodeDecodeError: + logger.error("Can't open target as a text file, so trying to read zstd compressed binary file") + with open(path, 'rb') as f: + dctx = zstandard.ZstdDecompressor() + stream_reader = dctx.stream_reader(f) + text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8') + file_format, header = _get_header(text_stream) + compressed = True + + return cls(file_format=file_format, path=path, header=header, compressed=compressed) + + def read(self) -> pl.DataFrame: + """ Read variant information into a polars df (expensive operation). Automatically handle compressed data. 
""" + # column_1 is always CHROM, which must always be a string or X/Y/MT/PAR will break inferred dtypes + logger.debug("Reading target into memory") + chrom_dtype = {'column_1': str} + if self.compressed: + with open(self.path, 'rb') as f: + dctx = zstandard.ZstdDecompressor() + with dctx.stream_reader(f) as reader: + df: pl.DataFrame = pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) + df.columns = self.header + return df.select(_default_cols()) + else: + df: pl.DataFrame = pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) + df.columns = self.header + return df.select(_default_cols()) + + +def _get_header(fh) -> tuple[str, list[str]]: + header = None + file_format = None + logger.debug(f"Scanning header to get file format and column names") + for line in fh: + if line.startswith('#'): + logger.debug("pvar format detected") + file_format = 'pvar' + header = _pvar_header(fh) + break + else: + logger.debug("bim format detected") + file_format = 'bim' + header = _bim_header() + break + + return file_format, header + + +def _pvar_header(fh) -> list[str]: + """ Get the column names from the pvar file (not constrained like bim, especially when converted from VCF) """ + line: str = '#' + while line.startswith('#'): + line: str = fh.readline() + if line.startswith('#CHROM'): + return line.strip().split('\t') + + +def _bim_header() -> list[str]: + return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] + + +def _default_cols() -> list[str]: + return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] # only columns we want from a target genome From 79759510f95e5a448517b05c20d8d2dfd737be9e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 14:58:10 +0100 Subject: [PATCH 11/46] remove single match mode, scan_csv not compatible with bytesIO (zstd) --- pgscatalog_utils/match/match_variants.py | 31 ++++-------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 336d781..dd4ec4e 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -6,9 +6,9 @@ import polars as pl from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches -from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out, write_log @@ -28,18 +28,12 @@ def match_variants(): n_target_files = len(glob(args.target)) matches: pl.DataFrame - if n_target_files == 1 and not args.fast: - match_mode: str = 'single' - elif n_target_files > 1 and not args.fast: + if n_target_files > 1 and not args.fast: match_mode: str = 'multi' - elif args.fast: + else: match_mode: str = 'fast' match match_mode: - case "single": - logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.skip_flip, @@ -81,8 +75,7 @@ def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs 
without iterating through chromosomes - target: pl.DataFrame = read_target(path=target_path, - remove_multiallelic=remove_multiallelic) + target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") return get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match) @@ -92,26 +85,12 @@ def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_mu matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, - remove_multiallelic=remove_multiallelic) + target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) _check_target_chroms(target) matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) return pl.concat(matches) -def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - matches = [] - for chrom in scorefile['chr_name'].unique().to_list(): - target = read_target(target_path, remove_multiallelic=remove_multiallelic, - single_file=True, chrom=chrom) # scans and filters - if target: - logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) - - return pl.concat(matches) - - def _description_text() -> str: return textwrap.dedent('''\ Match variants from a combined scoring file against a set of From b6aa2b0899817c518056480f50677e7d9b477a38 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 15:36:04 +0100 Subject: [PATCH 12/46] compress matched scorefiles --- pgscatalog_utils/match/write.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 53eb15f..52253a3 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -37,9 +37,11 @@ def _write_scorefile(effect_type: str, scorefiles: pl.DataFrame, split: bool, ou for k, v in df_dict.items(): chr = k.replace("false", "ALL") - path: str = os.path.join(outdir, f"{dataset}_{chr}_{effect_type}_{i}.scorefile") + path: str = os.path.join(outdir, f"{dataset}_{chr}_{effect_type}_{i}.scorefile.gz") logger.debug(f"Writing matched scorefile to {path}") - v.write_csv(path, sep="\t") + + with gzip.open(path, 'wb') as f: + v.write_csv(f, sep="\t") def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: From 92bd91ef515f023d06a61f01a6853ce280b0ab85 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 16:35:45 +0100 Subject: [PATCH 13/46] skeleton aggregation --- pgscatalog_utils/aggregate/__init__.py | 0 pgscatalog_utils/score.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 pgscatalog_utils/aggregate/__init__.py create mode 100644 pgscatalog_utils/score.py diff --git a/pgscatalog_utils/aggregate/__init__.py b/pgscatalog_utils/aggregate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/score.py b/pgscatalog_utils/score.py new file mode 100644 index 0000000..e0e8305 --- /dev/null +++ b/pgscatalog_utils/score.py @@ -0,0 +1,12 @@ +import zstandard +from dataclasses import dataclass +import io +import logging +import polars as pl + +logger
= logging.getLogger(__name__) + + +@dataclass +class Score: + """ A class that represents calculated scores (.sscore)""" From da0105e19f3407ef910523b6a57cfeb49ce645ab Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 22 Sep 2022 11:23:28 +0100 Subject: [PATCH 14/46] add aggregate_score --- .../aggregate/aggregate_scores.py | 92 +++++++++++++++++++ pgscatalog_utils/score.py | 12 --- pyproject.toml | 1 + 3 files changed, 93 insertions(+), 12 deletions(-) create mode 100644 pgscatalog_utils/aggregate/aggregate_scores.py delete mode 100644 pgscatalog_utils/score.py diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py new file mode 100644 index 0000000..2787680 --- /dev/null +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -0,0 +1,92 @@ +import argparse +import textwrap + +import pandas as pd + +from pgscatalog_utils.log_config import set_logging_level +import glob +import logging + +logger = logging.getLogger(__name__) + + +def aggregate_scores(): + args = _parse_args() + set_logging_level(args.verbose) + df = aggregate(glob.glob(args.scores)) + logger.debug("Compressing and writing combined scores") + df.to_csv('aggregated_scores.txt.gz', sep='\t', compression='gzip') + + +def aggregate(scorefiles: list[str]): + combined = pd.DataFrame() + aggcols = set() + + for i, path in enumerate(scorefiles): + logger.debug(f"Reading {path}") + # pandas can automatically detect zst compression, neat! + df = (pd.read_table(path) + .assign(sampleset=path.split('_')[0]) + .set_index(['sampleset', '#IID'])) + + df.index.names = ['sampleset', 'IID'] + + # Subset to aggregatable columns + df = df[_select_agg_cols(df.columns)] + aggcols.update(set(df.columns)) + + # Combine DFs + if i == 0: + logger.debug('Initialising combined DF') + combined = df.copy() + else: + logger.debug('Adding to combined DF') + combined = combined.add(df, fill_value=0) + + assert all([x in combined.columns for x in aggcols]), "All Aggregatable Columns are present in the final DF" + + return combined.pipe(_calculate_average) + + +def _calculate_average(combined: pd.DataFrame): + logger.debug("Averaging data") + avgs = combined.loc[:, combined.columns.str.endswith('_SUM')].divide(combined['DENOM'], axis=0) + avgs.columns = avgs.columns.str.replace('_SUM', '_AVG') + return pd.concat([combined, avgs], axis=1) + + +def _select_agg_cols(cols): + keep_cols = ['DENOM'] + return [x for x in cols if (x.endswith('_SUM') and (x != 'NAMED_ALLELE_DOSAGE_SUM')) or (x in keep_cols)] + + +def _description_text() -> str: + return textwrap.dedent(''' + Aggregate plink .sscore files into a combined TSV table. + + This aggregation sums scores that were calculated from plink + .scorefiles. Scorefiles may be split to calculate scores over different + chromosomes or effect types. The PGS Catalog calculator automatically splits + scorefiles where appropriate, and uses this script to combine them. + + Input .sscore files can be optionally compressed with zstd or gzip. + + The aggregated output scores are compressed with gzip. + ''') + + +def _parse_args(args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=_description_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-s', '--scores', dest='scores', required=True, + help=' Path to scorefiles. 
Use a wildcard (*) to select multiple files.') + parser.add_argument('-o', '--outdir', dest='outdir', required=True, + default='scores/', help=' Output directory to store downloaded files') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help=' Extra logging information') + return parser.parse_args(args) + + +if __name__ == "__main__": + aggregate_scores() + diff --git a/pgscatalog_utils/score.py b/pgscatalog_utils/score.py deleted file mode 100644 index e0e8305..0000000 --- a/pgscatalog_utils/score.py +++ /dev/null @@ -1,12 +0,0 @@ -import zstandard -from dataclasses import dataclass -import io -import logging -import polars as pl - -logger = logging.getLogger(__name__) - - -@dataclass -class Score: - """ A class that represents calculated scores (.sscore)""" diff --git a/pyproject.toml b/pyproject.toml index 23caf20..9c7bbb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ readme = "README.md" combine_scorefiles = "pgscatalog_utils.scorefile.combine_scorefiles:combine_scorefiles" download_scorefiles = "pgscatalog_utils.download.download_scorefile:download_scorefile" match_variants = "pgscatalog_utils.match.match_variants:match_variants" +aggregate_scores = "pgscatalog_utils.aggregate.aggregate_scores:aggregate_scores" [tool.poetry.dependencies] python = "^3.10" From 85715dc93ff1fe5b6c0d3202bb178cba32694fbe Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 14:32:22 +0100 Subject: [PATCH 15/46] Cleanup the code, ignore some user warnings from pandas_schema and updatepoetry files --- .../validate/formatted/validator.py | 34 ++-- .../validate/harmonized_position/validator.py | 2 +- pgscatalog_utils/validate/schemas.py | 1 - .../validate/validate_scorefile.py | 177 ++++++++---------- pgscatalog_utils/validate/validator_base.py | 26 ++- poetry.lock | 13 ++ pyproject.toml | 3 + 7 files changed, 127 insertions(+), 129 deletions(-) diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py index eda02cc..1e42336 100644 --- a/pgscatalog_utils/validate/formatted/validator.py +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -3,8 +3,6 @@ from pandas_schema import Schema from pgscatalog_utils.validate.schemas import * from pgscatalog_utils.validate.validator_base import * -# from schemas import * -# from validator_base import * ''' PGS Catalog Harmonized file validator @@ -17,7 +15,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): super().__init__(file, score_dir, logfile, error_limit) self.score_dir=None self.meta_format = FORMATTED_META_GENERIC - self.validators = FORMATTED_VALIDATORS + self.schema_validators = FORMATTED_VALIDATORS self.valid_cols = VALID_COLS_FORMATTED self.valid_type = VALID_TYPE_FORMATTED self.setup_field_validation() @@ -91,32 +89,28 @@ def validate_data(self) -> bool: self.get_and_check_variants_number() for chunk in self.df_iterator(self.file): - to_validate = chunk[self.cols_to_read] - to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded + dataframe_to_validate = chunk[self.cols_to_read] + dataframe_to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded # Detect duplicated rows - self.detect_duplicated_rows(to_validate) + self.detect_duplicated_rows(dataframe_to_validate) + # validate the snp column if present if SNP_DSET in self.header: + sub_schema = FORMATTED_VALIDATORS_SNP if CHR_DSET and BP_DSET in self.header: - self.schema = 
Schema([FORMATTED_VALIDATORS_SNP_EMPTY[h] for h in self.cols_to_validate]) - else: - self.schema = Schema([FORMATTED_VALIDATORS_SNP[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + sub_schema = FORMATTED_VALIDATORS_SNP_EMPTY + self.validate_schema(sub_schema,dataframe_to_validate) if CHR_DSET and BP_DSET in self.header: - self.schema = Schema([FORMATTED_VALIDATORS_POS[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(FORMATTED_VALIDATORS_POS, dataframe_to_validate) + if OR_DSET in self.header: - self.schema = Schema([FORMATTED_VALIDATORS_OR[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(FORMATTED_VALIDATORS_OR,dataframe_to_validate) + if HR_DSET in self.header: - self.schema = Schema([FORMATTED_VALIDATORS_HR[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(FORMATTED_VALIDATORS_HR,dataframe_to_validate) + self.process_errors() if len(self.bad_rows) >= self.error_limit: break diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py index b46e8c4..87b9346 100644 --- a/pgscatalog_utils/validate/harmonized_position/validator.py +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -13,7 +13,7 @@ class ValidatorPos(ValidatorBase): def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): super().__init__(file, score_dir, logfile, error_limit) self.meta_format = HM_META_POS - self.validators = POS_VALIDATORS + self.schema_validators = POS_VALIDATORS self.valid_cols = VALID_COLS_POS self.valid_type = VALID_TYPE_POS self.setup_field_validation() diff --git a/pgscatalog_utils/validate/schemas.py b/pgscatalog_utils/validate/schemas.py index 7487b21..43e8e27 100644 --- a/pgscatalog_utils/validate/schemas.py +++ b/pgscatalog_utils/validate/schemas.py @@ -1,4 +1,3 @@ -import sys import numpy as np from pandas_schema import Column from pandas_schema.validation import MatchesPatternValidation, InListValidation, CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation, CustomElementValidation diff --git a/pgscatalog_utils/validate/validate_scorefile.py b/pgscatalog_utils/validate/validate_scorefile.py index 3e38bf4..80294c3 100644 --- a/pgscatalog_utils/validate/validate_scorefile.py +++ b/pgscatalog_utils/validate/validate_scorefile.py @@ -1,6 +1,7 @@ import os, glob, re import argparse import logging +import textwrap data_sum = {'valid': [], 'invalid': [], 'other': []} @@ -8,6 +9,55 @@ logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + +def validate_scorefile() -> None: + global data_sum, score_dir + args = _parse_args() + _check_args(args) + + # Check PGS Catalog file name nomenclature + check_filename = False + if args.check_filename: + check_filename = True + else: + print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.") + + validator_type = args.t + files_dir = args.dir + log_dir = args.log_dir + + ## Select validator class ## + if validator_type == 'formatted': + import pgscatalog_utils.validate.formatted.validator as validator_package + elif validator_type == 'hm_pos': + import 
pgscatalog_utils.validate.harmonized_position.validator as validator_package + + ## Run validator ## + # One file + if args.f: + _run_validator(args.f,log_dir,score_dir,validator_package,check_filename,validator_type) + # Content of the directory + elif files_dir: + count_files = 0 + # Browse directory: for each file run validator + for filepath in sorted(glob.glob(files_dir+"/*.*")): + _run_validator(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) + count_files += 1 + + # Print summary + results + print("\nSummary:") + if data_sum['valid']: + print(f"- Valid: {len(data_sum['valid'])}/{count_files}") + if data_sum['invalid']: + print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") + if data_sum['other']: + print(f"- Other issues: {len(data_sum['other'])}/{count_files}") + + if data_sum['invalid']: + print("Invalid files:") + print("\n".join(data_sum['invalid'])) + + def _read_last_line(file: str) -> str: ''' Return the last line of the file @@ -36,47 +86,7 @@ def _file_validation_state(filename: str, log_file: str) -> None: data_sum['other'].append(filename) -def _run_validator(validator: object, file: str, check_filename: bool, logfile: str, validator_type: str) -> None: - ''' Main method to run the PGS file validator ''' - if check_filename: - validator.run_validator() - else: - validator.run_validator_skip_check_filename() - # validator.logger.propagate = False - - # # Check files exist - # if not file or not logfile: - # validator.logger.info("Missing file and/or logfile") - # validator.set_file_is_invalid() - # elif file and not os.path.exists(file): - # validator.logger.info("Error: the file '"+file+"' can't be found") - # validator.set_file_is_invalid() - - # # Validate file extension - # validator.validate_file_extension() - - # # Validate file name nomenclature - # if validator.is_file_valid() and check_filename: - # validator.validate_filename() - - # # Only for harmonized files - # if validator.is_file_valid() and validator_type != 'formatted': - # validator.compare_with_filename() - - # # Validate column headers - # if validator.is_file_valid(): - # validator.validate_headers() - - # # Validate data content - # if validator.is_file_valid(): - # validator.validate_data() - - # # Close log handler - # validator.logger.removeHandler(validator.handler) - # validator.handler.close() - - -def _check_args(args): +def _check_args(args: argparse.Namespace) -> None: global score_dir ## Check parameters ## @@ -112,79 +122,50 @@ def _check_args(args): print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.") -def validate_file(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None: +def _run_validator(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None: ''' Run the file validator ''' file = os.path.basename(filepath) filename = file.split('.')[0] print(f"# Filename: {file}") - log_file = log_dir+'/'+filename+'_log.txt' + log_file = f'{log_dir}/{filename}_log.txt' # Run validator validator = validator_package.init_validator(filepath,log_file,score_dir) - _run_validator(validator,filepath,check_filename,log_file,validator_type) + if check_filename: + validator.run_validator() + else: + validator.run_validator_skip_check_filename() # Check log 
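    # _file_validation_state() inspects the log written by the validator and files the
    # result under data_sum['valid'], data_sum['invalid'] or data_sum['other'] for the summary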
_file_validation_state(file,log_file) -def main(): - global data_sum, score_dir +def _description_text() -> str: + return textwrap.dedent('''\ + Validate a set of scoring files to match the PGS Catalog scoring file formats. + It can validate: + - The formatted scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring) + - The harmonized (Position) scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos) + ''') - argparser = argparse.ArgumentParser() - argparser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE') - argparser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME') - argparser.add_argument('--dir', help='The name of the directory containing the files that need to processed (no need to use the [-f] option') - argparser.add_argument('--score_dir', help=' The name of the directory containing the formatted scoring files to compare with harmonized scoring files') - argparser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True) - argparser.add_argument('--check_filename', help=' Check that the file name match the PGS Catalog nomenclature', required=False, action='store_true') - - args = argparser.parse_args() - - ## Check parameters ## - _check_args(args) - # Check PGS Catalog file name nomenclature - check_filename = False - if args.check_filename: - check_filename = True - else: - print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.") +def _epilog_text() -> str: + return textwrap.dedent(f'''\ + You need to specify the type of file format to validate, using the paramter '-t' ({' or '.join(val_types)}). 
+ ''') - validator_type = args.t - files_dir = args.dir - log_dir = args.log_dir +def _parse_args(args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE') + parser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME') + parser.add_argument('--dir', help='The name of the directory containing the files that need to processed (no need to use the [-f] option') + parser.add_argument('--score_dir', help=' The name of the directory containing the formatted scoring files to compare with harmonized scoring files') + parser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True) + parser.add_argument('--check_filename', help=' Check that the file name match the PGS Catalog nomenclature', required=False, action='store_true') + return parser.parse_args(args) - ## Select validator class ## - if validator_type == 'formatted': - import pgscatalog_utils.validate.formatted.validator as validator_package - elif validator_type == 'hm_pos': - import pgscatalog_utils.validate.harmonized_position.validator as validator_package - - ## Run validator ## - # One file - if args.f: - validate_file(args.f,log_dir,score_dir,validator_package,check_filename,validator_type) - # Content of the directory - elif files_dir: - count_files = 0 - # Browse directory: for each file run validator - for filepath in sorted(glob.glob(files_dir+"/*.*")): - validate_file(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) - count_files += 1 - - # Print summary + results - print("\nSummary:") - if data_sum['valid']: - print(f"- Valid: {len(data_sum['valid'])}/{count_files}") - if data_sum['invalid']: - print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") - if data_sum['other']: - print(f"- Other issues: {len(data_sum['other'])}/{count_files}") - - if data_sum['invalid']: - print("Invalid files:") - print("\n".join(data_sum['invalid'])) if __name__ == '__main__': - main() + validate_scorefile() diff --git a/pgscatalog_utils/validate/validator_base.py b/pgscatalog_utils/validate/validator_base.py index 80af5c4..ddfbc59 100644 --- a/pgscatalog_utils/validate/validator_base.py +++ b/pgscatalog_utils/validate/validator_base.py @@ -7,10 +7,8 @@ from typing import List import pandas as pd import pandas_schema -from pgscatalog_utils.validate.schemas import * import warnings - -warnings.filterwarnings('ignore', category=UserWarning, module='pandas_schema') +from pgscatalog_utils.validate.schemas import * ''' PGS Catalog file validator @@ -23,7 +21,7 @@ class ValidatorBase: valid_extensions = VALID_FILE_EXTENSIONS - validators = GENERIC_VALIDATORS + schema_validators = GENERIC_VALIDATORS valid_cols = [] valid_type = '' sep = '\t' @@ -55,6 +53,18 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.variants_number = 0 + def validate_schema(self, schema: dict, dataframe_to_validate: pd.core.frame.DataFrame): + ''' + Run the pandas_schema validation using the provided Schema and DataFrame + ''' + self.schema = pandas_schema.Schema([schema[h] for h in self.cols_to_validate]) + with warnings.catch_warnings(): + # Ignore python warningd raised in the pandas_schema code + warnings.simplefilter('ignore', 
UserWarning) + errors = self.schema.validate(dataframe_to_validate) + self.store_errors(errors) + + def setup_field_validation(self): ''' Fetch the header and build the list of column to check/validate @@ -146,13 +156,11 @@ def validate_data(self) -> bool: # Validate data content and check the consitence between the declared variants number and the actual number of variants in the file self.validate_content() for chunk in self.df_iterator(self.file): - to_validate = chunk[self.cols_to_read] - to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded + dataframe_to_validate = chunk[self.cols_to_read] + dataframe_to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded # Schema validation - self.schema = pandas_schema.Schema([self.validators[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(self.schema_validators,dataframe_to_validate) self.process_errors() if len(self.bad_rows) >= self.error_limit: diff --git a/poetry.lock b/poetry.lock index e920a73..c3a2742 100644 --- a/poetry.lock +++ b/poetry.lock @@ -120,6 +120,19 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "pandas-schema" +version = "0.3.6" +description = "A validation library for Pandas data frames using user-friendly schemas" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +numpy = "*" +packaging = "*" +pandas = ">=0.19" + [[package]] name = "pluggy" version = "1.0.0" diff --git a/pyproject.toml b/pyproject.toml index 44ef233..e23d84b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,10 +11,13 @@ readme = "README.md" combine_scorefiles = "pgscatalog_utils.scorefile.combine_scorefiles:combine_scorefiles" download_scorefiles = "pgscatalog_utils.download.download_scorefile:download_scorefile" match_variants = "pgscatalog_utils.match.match_variants:match_variants" +validate_scorefiles = "pgscatalog_utils.validate.validate_scorefile:validate_scorefile" [tool.poetry.dependencies] python = "^3.10" +numpy = "^1.23.3" pandas = "^1.4.3" +pandas-schema = "^0.3.6" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" From 60e3d9f174bb0b501037d0da688607a067cb83eb Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 14:43:00 +0100 Subject: [PATCH 16/46] Attempt to fix poetry error --- poetry.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/poetry.lock b/poetry.lock index c3a2742..7f2a58e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -344,6 +344,7 @@ pandas = [ {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, ] +pandas-schema = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, From f6d727f94e90d8f17bac3d9e311a5cea5d7f42de Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 14:52:35 +0100 Subject: [PATCH 17/46] Fix version discrepancies for numpy --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 7f2a58e..0cebbc4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -87,7 +87,7 @@ 
python-versions = ">=3.5" [[package]] name = "numpy" -version = "1.23.1" +version = "1.23.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" optional = false From 13216fed7dd81109d2a24b60ec0abd8b77b33873 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 15:02:22 +0100 Subject: [PATCH 18/46] Fix path to the 'validate' test data files --- tests/test_validate.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index e00448e..7459f05 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -6,6 +6,7 @@ log_file = 'VALIDATE.log' +test_data_dir = './tests/data' ###### Formatted scoring files ###### @@ -109,40 +110,40 @@ def test_valid_formatted_file_pos_only_38(test_hmpos_file_GRCh38_3): @pytest.fixture def test_file_1(): - return './data/test_scoring_file_1.txt.gz' + return f'{test_data_dir}/test_scoring_file_1.txt.gz' @pytest.fixture def test_file_2(): - return './data/test_scoring_file_2.txt.gz' + return f'{test_data_dir}/test_scoring_file_2.txt.gz' @pytest.fixture def test_file_3(): - return './data/test_scoring_file_3.txt.gz' + return f'{test_data_dir}/test_scoring_file_3.txt.gz' @pytest.fixture def test_file_4(): - return './data/test_scoring_file_4.txt.gz' + return f'{test_data_dir}/test_scoring_file_4.txt.gz' @pytest.fixture def test_hmpos_file_GRCh37_1(): - return './data/test_scoring_file_hmpos_37_1.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_37_1.txt.gz' @pytest.fixture def test_hmpos_file_GRCh38_1(): - return './data/test_scoring_file_hmpos_38_1.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_38_1.txt.gz' @pytest.fixture def test_hmpos_file_GRCh37_2(): - return './data/test_scoring_file_hmpos_37_2.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_37_2.txt.gz' @pytest.fixture def test_hmpos_file_GRCh38_2(): - return './data/test_scoring_file_hmpos_38_2.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_38_2.txt.gz' @pytest.fixture def test_hmpos_file_GRCh37_3(): - return './data/test_scoring_file_hmpos_37_3.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_37_3.txt.gz' @pytest.fixture def test_hmpos_file_GRCh38_3(): - return './data/test_scoring_file_hmpos_38_3.txt.gz' \ No newline at end of file + return f'{test_data_dir}/test_scoring_file_hmpos_38_3.txt.gz' \ No newline at end of file From b60b0aa32b9a9ac459ca21f3e5153afbc7a85048 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 23 Sep 2022 09:54:28 +0100 Subject: [PATCH 19/46] Update the README file --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d19c186..e0126ec 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ other users might find some of these tools helpful. in 'long' format * `match_variants`: Match target variants (bim or pvar files) against the output of `combine_scorefile` to produce scoring files for plink 2 +* `validate_scorefiles`: Check/validate that the scoring files and harmonized scoring files match the PGS Catalog scoring file formats. ## Installation @@ -26,6 +27,7 @@ $ pip install pgscatalog-utils $ download_scorefiles -i PGS000922 PGS001229 -o . -b GRCh37 $ combine_scorefiles -s PGS*.txt.gz -o combined.txt $ match_variants -s combined.txt -t --min_overlap 0.75 --outdir . +$ validate_scorefiles -t formatted --dir --log_dir ``` More details are available using the `--help` parameter. 
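The `validate_scorefiles` console script maps to `pgscatalog_utils.validate.validate_scorefile:validate_scorefile`, which reads its options from `sys.argv` via argparse, so it can also be driven from Python, e.g. in a quick smoke test. A minimal sketch, assuming placeholder `scoring_files/` and `logs/` directories that already exist:

```
import sys

from pgscatalog_utils.validate.validate_scorefile import validate_scorefile

# validate_scorefile() parses sys.argv itself, so patch argv before calling it
sys.argv = ["validate_scorefiles",
            "-t", "formatted",          # or "hm_pos" for harmonized (position) files
            "--dir", "scoring_files/",  # placeholder: directory of scoring files to check
            "--log_dir", "logs/"]       # placeholder: where the per-file *_log.txt files go
validate_scorefile()
```

As on the command line, `--check_filename` and `--score_dir` are optional; omitting them only skips the file name nomenclature check and the formatted-vs-harmonized row-count comparison.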
From 1881c17ddf978230d9eb1b84a33dc271c2da11e4 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 11:30:31 +0100 Subject: [PATCH 20/46] fix df truthiness --- pgscatalog_utils/match/match.py | 8 ++++---- pgscatalog_utils/match/preprocess.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 677f22a..e0347b2 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -18,7 +18,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] - if scorefile_oa: + if not scorefile_oa.is_empty(): logger.debug("Getting matches for scores with effect allele and other allele") matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt").select(col_order)) matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref").select(col_order)) @@ -26,7 +26,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip").select(col_order)) matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip").select(col_order)) - if scorefile_no_oa: + if not scorefile_no_oa.is_empty(): logger.debug("Getting matches for scores with effect allele only") matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt").select(col_order)) @@ -92,7 +92,7 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: """ Casting important columns to categorical makes polars fast """ - if scorefile: + if not scorefile.is_empty(): scorefile = scorefile.with_columns([ pl.col("effect_allele").cast(pl.Categorical), pl.col("other_allele").cast(pl.Categorical), @@ -101,7 +101,7 @@ def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: pl.col("other_allele_FLIP").cast(pl.Categorical), pl.col("accession").cast(pl.Categorical) ]) - if target: + if not target.is_empty(): target = target.with_columns([ pl.col("ID").cast(pl.Categorical), pl.col("REF").cast(pl.Categorical), diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 4d93090..206466f 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -35,14 +35,14 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if df.select('is_multiallelic').sum() > 0: + if df.get_column('is_multiallelic').sum() > 0: logger.debug("Multiallelic variants detected") if remove_multiallelic: if file_format == "bim": logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " "variant representations only") logger.debug('Dropping multiallelic variants') - return df.filter(~df['is_multiallelic']) + return df.filter(~df.get_column('is_multiallelic')) else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants From 3bf62c398071e7737514fbedfd459d475259600e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 
11:32:39 +0100 Subject: [PATCH 21/46] fix bumped version --- Dockerfile | 4 ++-- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8c19690..0d42228 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ FROM python:3.10 WORKDIR /opt/ -COPY --from=builder /app/dist/pgscatalog_utils-0.1.2-py3-none-any.whl . +COPY --from=builder /app/dist/pgscatalog_utils-0.2.0-py3-none-any.whl . -RUN pip install pgscatalog_utils-0.1.2-py3-none-any.whl +RUN pip install pgscatalog_utils-0.2.0-py3-none-any.whl RUN apt-get update && apt-get install -y sqlite3 \ No newline at end of file diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index 8ce9b36..7fd229a 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.3' +__version__ = '0.2.0' diff --git a/pyproject.toml b/pyproject.toml index 9c7bbb5..e362cb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.1.3" +version = "0.2.0" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert "] From f02c58c6b9af287521f68638bb82f4df5590d6c5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 11:36:27 +0100 Subject: [PATCH 22/46] batch process input to reduce memory usage --- pgscatalog_utils/target.py | 119 ++++++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 14 deletions(-) diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 6b28998..22204f8 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -33,22 +33,113 @@ def from_path(cls, path): return cls(file_format=file_format, path=path, header=header, compressed=compressed) - def read(self) -> pl.DataFrame: - """ Read variant information into a polars df (expensive operation). Automatically handle compressed data. 
""" - # column_1 is always CHROM, which must always be a string or X/Y/MT/PAR will break inferred dtypes - logger.debug("Reading target into memory") - chrom_dtype = {'column_1': str} + # @profile + def read(self): + # this function is responsible for returning dfs allocated to contiguous memory, so manually rechunk if self.compressed: - with open(self.path, 'rb') as f: - dctx = zstandard.ZstdDecompressor() - with dctx.stream_reader(f) as reader: - df: pl.DataFrame = pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) - df.columns = self.header - return df.select(_default_cols()) + return self._read_compressed_chunks().rechunk() else: - df: pl.DataFrame = pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) - df.columns = self.header - return df.select(_default_cols()) + batch_size = 10000000 + n_rows_read = 0 + df_lst = [] + while True: + df_lst.append(self._read_batch(batch_size=batch_size, n_skip=n_rows_read)) + n_rows_read = n_rows_read + batch_size + + if df_lst[-1].shape[0] < batch_size: + logger.debug("Finished reading final batch") + break + + return pl.concat(df_lst, rechunk=True) + + def _read_batch(self, batch_size, n_skip): + logger.debug(f"{n_skip} target variants read, reading next batch") + assert not self.compressed + # TODO: lazy frame it + logger.debug("Reading uncompressed data") + return pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', n_threads=1, + dtype=_get_col_dtypes(self.file_format), + columns=_get_default_col_idx(self.file_format), + new_columns=_default_cols(), + rechunk=False, + n_rows=batch_size, + skip_rows_after_header=n_skip) + + def _read_compressed_chunks(self): + logger.debug("Reading zstd compressed data") + df_lst = [] + dtypes = _get_col_dtypes(self.file_format) + columns = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + chunk_buffer = b'' + + # don't decode bytes stream to utf-8 with TextIOWrapper in python, polars + rust will be faster + for chunk in dctx.read_to_iter(fh, read_size=int(1e+8)): # read 100MB of compressed data per chunk + if not chunk: + break + + end = chunk.rfind(b'\n') + 1 # only want to read complete rows + if chunk_buffer: + row_chunk = b''.join([chunk_buffer, chunk[:end]]) + chunk_buffer = b'' + else: + row_chunk = chunk[:end] + + df = pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', n_threads=1, + dtype=dtypes, + columns=columns, + new_columns=new_col_names, + rechunk=False) + df_lst.append(df) + chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + + return pl.concat(df_lst, rechunk=False) + + +def _get_default_col_idx(file_format): + # import default columns: + # ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] + match file_format: + case 'bim': + return [0, 1, 3, 4, 5] # see _get_col_dtypes, dropping centimorgans + case 'pvar': + return [0, 1, 2, 3, 4] # dropping QUAL FILTER INFO etc + case _: + logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") + raise Exception + + +def _get_col_dtypes(file_format): + """ Manually set up categorical dtypes """ + match file_format: + case 'bim': + # 1. Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name + # 2. Variant identifier + # 3. Position in morgans or centimorgans (safe to use dummy value of '0') + # 4. Base-pair coordinate (1-based; limited to 231-2) + # 5. 
Allele 1 (corresponding to clear bits in .bed; usually minor) + # 6. Allele 2 (corresponding to set bits in .bed; usually major) + d = {'column_1': pl.Categorical, 'column_2': str, 'column_3': pl.Float64, 'column_4': pl.UInt64, + 'column_5': pl.Categorical, 'column_6': pl.Categorical} + case 'pvar': + # 1. CHROM + # 2. POS (base-pair coordinate) + # 3. ID (variant ID; required) + # 4. REF (reference allele) + # 5. ALT (alternate alleles, comma-separated) + # 6. QUAL (phred-scaled quality score for whether the locus is variable at all) + # 7. FILTER ('PASS', '.', or semicolon-separated list of failing filter codes) + # 8. INFO (semicolon-separated list of flags and key-value pairs, with types declared in header) + d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Utf8, 'column_4': pl.Categorical, + 'column_5': pl.Utf8, 'column_6': pl.Float32, 'column_7': pl.Utf8, 'column_8': pl.Utf8} + # can't cast ALT to cat yet, because of multiallelic variants! + case _: + logger.critical("Trying to set header dtypes for an invalid file format, time to explode") + raise Exception + return d def _get_header(fh) -> tuple[str, list[str]]: From ae8ce1446d9aacdea0f46fc8521ace5a92f0abf1 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 16:41:59 +0100 Subject: [PATCH 23/46] read uncompressed data with a bufferedreader --- pgscatalog_utils/target.py | 98 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 22204f8..ee62074 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -11,7 +11,6 @@ class Target: """ Class to detect and read a plink1/plink2 variant information file """ file_format: str = None - header: list[str] = None path: str = None compressed: bool = False @@ -20,7 +19,7 @@ def from_path(cls, path): """ Create a Target object from a path. Cheaply detect file format and headers. """ try: with open(path, 'r') as f: - file_format, header = _get_header(f) + file_format = _get_format(f) compressed = False except UnicodeDecodeError: logger.error("Can't open target as a text file, so trying to read zstd compressed binary file") @@ -28,44 +27,61 @@ def from_path(cls, path): dctx = zstandard.ZstdDecompressor() stream_reader = dctx.stream_reader(f) text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8') - file_format, header = _get_header(text_stream) + file_format = _get_format(text_stream) compressed = True - return cls(file_format=file_format, path=path, header=header, compressed=compressed) + return cls(file_format=file_format, path=path, compressed=compressed) - # @profile + #@profile def read(self): - # this function is responsible for returning dfs allocated to contiguous memory, so manually rechunk if self.compressed: - return self._read_compressed_chunks().rechunk() + df = self._read_compressed_chunks().rechunk().lazy() + return _filter_target(df) else: - batch_size = 10000000 - n_rows_read = 0 - df_lst = [] + df = self._read_uncompressed_chunks().rechunk().lazy() + return _filter_target(df) + + def _read_uncompressed_chunks(self): + """ Read a CSV using a BufferedIOReader. This is a bit slower than pl.read_csv() (30s vs 5s). + + Lots of testing showed that lazy scanning and native polars reading used a lot of RAM, then freed a bunch. + Plotting RAM usage against time looked like a spiky hedgehog. + + This function linearly consumes RAM in a more linear way by: + 1. Reading a batch of lines + 2. 
Dropping unused columns + 3. Setting categorical dtypes on read + 4. Don't rechunk until later + """ + logger.debug("Reading uncompressed chunks") + + df_lst = [] + dtypes = _get_col_dtypes(self.file_format) + col_idxs = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + + with open(self.path, "rb") as f: while True: - df_lst.append(self._read_batch(batch_size=batch_size, n_skip=n_rows_read)) - n_rows_read = n_rows_read + batch_size + buffer = b''.join(f.readlines(int(1e6))) - if df_lst[-1].shape[0] < batch_size: - logger.debug("Finished reading final batch") + if not buffer: break - return pl.concat(df_lst, rechunk=True) - - def _read_batch(self, batch_size, n_skip): - logger.debug(f"{n_skip} target variants read, reading next batch") - assert not self.compressed - # TODO: lazy frame it - logger.debug("Reading uncompressed data") - return pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', n_threads=1, - dtype=_get_col_dtypes(self.file_format), - columns=_get_default_col_idx(self.file_format), - new_columns=_default_cols(), - rechunk=False, - n_rows=batch_size, - skip_rows_after_header=n_skip) + df = (pl.read_csv(buffer, sep='\t', has_header=False, comment_char='#', n_threads=1, + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + rechunk=False)) + + df_lst.append(df) + + return pl.concat(df_lst, rechunk=False) def _read_compressed_chunks(self): + """ Like _read_uncompressed_chunks, but read chunks of bytes and handle incomplete rows + + zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars + """ logger.debug("Reading zstd compressed data") df_lst = [] dtypes = _get_col_dtypes(self.file_format) @@ -76,7 +92,6 @@ def _read_compressed_chunks(self): dctx = zstandard.ZstdDecompressor() chunk_buffer = b'' - # don't decode bytes stream to utf-8 with TextIOWrapper in python, polars + rust will be faster for chunk in dctx.read_to_iter(fh, read_size=int(1e+8)): # read 100MB of compressed data per chunk if not chunk: break @@ -113,7 +128,7 @@ def _get_default_col_idx(file_format): def _get_col_dtypes(file_format): - """ Manually set up categorical dtypes """ + """ Manually set up dtypes. pl.Categorical saves a lot of RAM vs pl.Utf8 """ match file_format: case 'bim': # 1. 
Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name @@ -142,36 +157,25 @@ def _get_col_dtypes(file_format): return d -def _get_header(fh) -> tuple[str, list[str]]: - header = None +def _get_format(fh) -> str: file_format = None - logger.debug(f"Scanning header to get file format and column names") + logger.debug(f"Scanning header to get file format") for line in fh: if line.startswith('#'): logger.debug("pvar format detected") file_format = 'pvar' - header = _pvar_header(fh) break else: logger.debug("bim format detected") file_format = 'bim' - header = _bim_header() break - return file_format, header - + return file_format -def _pvar_header(fh) -> list[str]: - """ Get the column names from the pvar file (not constrained like bim, especially when converted from VCF) """ - line: str = '#' - while line.startswith('#'): - line: str = fh.readline() - if line.startswith('#CHROM'): - return line.strip().split('\t') - -def _bim_header() -> list[str]: - return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] +def _default_cols() -> list[str]: + """ Standardise column names in a target genome """ + return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] def _default_cols() -> list[str]: From 3bb3e3d89a05302d58a83bdee76bc02dec04f4a5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 28 Sep 2022 17:15:58 +0100 Subject: [PATCH 24/46] add lazy evaluation --- pgscatalog_utils/match/filter.py | 16 +++---- pgscatalog_utils/match/label.py | 2 +- pgscatalog_utils/match/log.py | 24 +++++------ pgscatalog_utils/match/match.py | 55 ++++++++---------------- pgscatalog_utils/match/match_variants.py | 4 +- pgscatalog_utils/match/preprocess.py | 27 +++++++++--- pgscatalog_utils/match/read.py | 31 +++++++++---- pgscatalog_utils/match/write.py | 8 ++-- pgscatalog_utils/target.py | 21 +++++---- 9 files changed, 97 insertions(+), 91 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index c47a449..c2d0364 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -5,14 +5,14 @@ logger = logging.getLogger(__name__) -def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, - dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: +def filter_scores(scorefile: pl.LazyFrame, matches: pl.LazyFrame, min_overlap: float, + dataset: str) -> tuple[pl.LazyFrame, pl.LazyFrame]: """ Check overlap between filtered matches and scorefile, remove scores that don't match well and report stats """ - filtered_matches: pl.DataFrame = _filter_matches(matches) - match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) + filtered_matches: pl.LazyFrame = _filter_matches(matches) + match_log: pl.LazyFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) .with_columns(pl.col('best_match').fill_null(False))) - fail_rates: pl.DataFrame = _calculate_match_rate(match_log) + fail_rates: pl.DataFrame = _calculate_match_rate(match_log).collect() # collect for iteration scores: list[pl.DataFrame] = [] for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): @@ -25,7 +25,7 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: f logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) - score_summary: pl.DataFrame = pl.concat(scores) + score_summary: pl.LazyFrame = pl.concat(scores).lazy() 
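    # fail_rates was collected eagerly above so the accession loop can iterate over real rows;
    # wrapping score_summary back into a LazyFrame keeps the join with filtered_matches lazy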
filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left') .filter(pl.col('score_pass') == True)) @@ -39,12 +39,12 @@ def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate'))) -def _filter_matches(df: pl.DataFrame) -> pl.DataFrame: +def _filter_matches(df: pl.LazyFrame) -> pl.LazyFrame: logger.debug("Filtering variants with exclude flag") return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False)) -def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: +def _join_filtered_matches(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str) -> pl.LazyFrame: return (scorefile.join(matches, on=['row_nr', 'accession'], how='left') .with_column(pl.lit(dataset).alias('dataset')) .select(pl.exclude("^.*_right$"))) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 0d38ccb..ad7423c 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -60,7 +60,7 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: .then(pl.lit(True)) .otherwise(pl.lit(False)) .alias('best_match'))) - assert prioritised.shape[0] == df.shape[0] # I'm watching you, Wazowski. Always watching. Always. + return prioritised.drop(['match_priority', 'best_match_type']) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 91f3999..3b4686c 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -def make_logs(scorefile, match_candidates, filter_summary, dataset): +def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, filter_summary: pl.LazyFrame, dataset: str): # summary log -> aggregated from best matches (one per scoring file line) # big log -> unaggregated, written to compressed gzip, possibly multiple matches per scoring file line summary_log, big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, @@ -13,29 +13,29 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): dataset=dataset) # make sure the aggregated best log matches the scoring file accession line count - summary_count = (summary_log.groupby(pl.col('accession')) + summary_count: pl.LazyFrame = (summary_log.groupby(pl.col('accession')) .agg(pl.sum('count'))) - log_count = (scorefile.groupby("accession") - .count() - .join(summary_count, on='accession')) + log_count: pl.DataFrame = (scorefile.groupby("accession") + .agg(pl.count()) + .join(summary_count, on='accession')).collect() - assert (log_count['count'] == log_count['count_right']).all(), "Log doesn't match input scoring file" + assert (log_count.get_column('count') == log_count.get_column('count_right')).all(), "Log doesn't match input scoring file" logger.debug("Log matches input scoring file") return _prettify_log(big_log), _prettify_summary(summary_log) -def make_summary_log(best_matches, filter_summary): +def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) -> pl.LazyFrame: """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', 'duplicate_ID']) - .count() + .agg(pl.count()) .join(filter_summary, how='left', on='accession')) -def _prettify_summary(df: pl.DataFrame): +def _prettify_summary(df: pl.LazyFrame) -> 
pl.LazyFrame: keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate_best_match", "duplicate_ID", "count", "percent"] return (df.with_column((pl.col("count") / pl.sum("count") * 100) @@ -44,7 +44,7 @@ def _prettify_summary(df: pl.DataFrame): .select(keep_cols)) -def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: +def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] @@ -54,8 +54,8 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: return pretty_df -def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filter_summary: pl.DataFrame, - dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: +def _join_match_candidates(scorefile: pl.LazyFrame, matches: pl.LazyFrame, filter_summary: pl.LazyFrame, + dataset: str) -> tuple[pl.LazyFrame, pl.LazyFrame]: """ Join match candidates against the original scoring file """ logger.debug("Making big logs") diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index e0347b2..aedf941 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -9,32 +9,31 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - scorefile_cat, target_cat = _cast_categorical(scorefile, target) - scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) - scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) + scorefile_oa = scorefile.filter(pl.col("other_allele") != None) + scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) matches: list[pl.DataFrame] = [] col_order = ['row_nr', 'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] - if not scorefile_oa.is_empty(): - logger.debug("Getting matches for scores with effect allele and other allele") - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt").select(col_order)) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip").select(col_order)) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip").select(col_order)) + logger.debug("Getting matches for scores with effect allele and other allele") + matches.append(_match_variants(scorefile=scorefile_oa, target=target, match_type="refalt").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="altref").select(col_order)) + if skip_flip is False: + matches.append(_match_variants(scorefile_oa, target, match_type="refalt_flip").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="altref_flip").select(col_order)) - if not scorefile_no_oa.is_empty(): - logger.debug("Getting matches for scores with effect allele only") - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref").select(col_order)) - matches.append(_match_variants(scorefile_no_oa, target_cat, 
match_type="no_oa_alt").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) + logger.debug("Getting matches for scores with effect allele only") + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt").select(col_order)) + if skip_flip is False: + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(label_matches, remove_ambiguous, keep_first_match) + # manually collect to avoid concat error TODO: try to reproduce and file a bug report + logger.debug("Collecting all matches (parallel)") + return pl.concat(pl.collect_all(matches)).lazy().pipe(label_matches, remove_ambiguous, keep_first_match) def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: @@ -89,23 +88,3 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s pl.lit(match_type).alias("match_type")]) .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join - -def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: - """ Casting important columns to categorical makes polars fast """ - if not scorefile.is_empty(): - scorefile = scorefile.with_columns([ - pl.col("effect_allele").cast(pl.Categorical), - pl.col("other_allele").cast(pl.Categorical), - pl.col("effect_type").cast(pl.Categorical), - pl.col("effect_allele_FLIP").cast(pl.Categorical), - pl.col("other_allele_FLIP").cast(pl.Categorical), - pl.col("accession").cast(pl.Categorical) - ]) - if not target.is_empty(): - target = target.with_columns([ - pl.col("ID").cast(pl.Categorical), - pl.col("REF").cast(pl.Categorical), - pl.col("ALT").cast(pl.Categorical) - ]) - - return scorefile, target diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index dd4ec4e..64311ee 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -50,14 +50,14 @@ def match_variants(): valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, min_overlap=args.min_overlap) - if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 + if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") raise Exception big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) write_log(big_log, prefix=dataset) - summary_log.write_csv(f"{dataset}_summary.csv") + summary_log.collect().write_csv(f"{dataset}_summary.csv") write_out(valid_matches, args.split, args.outdir, dataset) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 206466f..d7b1e86 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -2,9 +2,21 @@ import polars as pl +from pgscatalog_utils.target import logger + logger = logging.getLogger(__name__) +def filter_target(df): + """ Remove variants that won't be matched against the scorefile + + Chromosomes 1 - 22, X, 
and Y with an efficient join. Remmove variants with missing identifiers also + """ + logger.debug("Filtering target to include chromosomes 1 - 22, X, Y") + chroms = [str(x) for x in list(range(1, 23)) + ['X', 'Y']] + return df.filter((pl.col('#CHROM').is_in(chroms)) & (pl.col('ID') != '.')) + + def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataFrame: """ Improved function to complement alleles. Will only complement sequences that are valid DNA. """ @@ -27,7 +39,7 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format: str) -> pl.DataFrame: +def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataFrame: # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) @@ -35,14 +47,15 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if df.get_column('is_multiallelic').sum() > 0: + multiallelic_canary = (df.filter(pl.col('is_multiallelic') == True) + .limit(1) # just detect the first occurring + .collect()) + + if not multiallelic_canary.is_empty(): logger.debug("Multiallelic variants detected") if remove_multiallelic: - if file_format == "bim": - logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " - "variant representations only") logger.debug('Dropping multiallelic variants') - return df.filter(~df.get_column('is_multiallelic')) + return df.filter(pl.col('is_multiallelic') == False) else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants @@ -55,3 +68,5 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: df.with_column( pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) + + diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index c25175a..91e55a6 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -3,7 +3,7 @@ import polars as pl -from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles +from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) @@ -23,18 +23,31 @@ def read_target(path: str, remove_multiallelic: bool) -> pl.DataFrame: dfs: list[pl.DataFrame] = [] for target in targets: assert target.file_format in ['bim', 'pvar'] - dfs.append(target.read().pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, - file_format=target.file_format)) + dfs.append(target.read()) - return pl.concat(dfs).filter(pl.col("ID") != '.') + logger.debug("Reading all target data complete") + # explicitly rechunk now, because reading is complete and the input data were read unchunked to save memory + # only pipe functions once rechunking has happened to improve speed + # handling multiallelic requires str methods, so don't forget to cast back or matching will break + return (pl.concat(dfs, rechunk=True) + .pipe(filter_target) + .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) + .with_column(pl.col('ALT').cast(pl.Categorical))) def read_scorefile(path: 
str) -> pl.DataFrame: logger.debug("Reading scorefile") - scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) + dtypes = {'chr_name': pl.Categorical, + 'chr_position': pl.UInt64, + 'effect_allele': pl.Utf8, # str functions required to complement + 'other_allele': pl.Utf8, + 'effect_type': pl.Categorical, + 'accession': pl.Categorical} + return (pl.scan_csv(path, sep='\t', dtype=dtypes) .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) .with_columns([ - pl.col('accession').cast(pl.Categorical), - pl.col("effect_type").cast(pl.Categorical)])) - - return scorefile + pl.col("effect_allele").cast(pl.Categorical), + pl.col("other_allele").cast(pl.Categorical), + pl.col("effect_allele_FLIP").cast(pl.Categorical), + pl.col("other_allele_FLIP").cast(pl.Categorical) + ])) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 52253a3..9d4ba92 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -7,18 +7,18 @@ logger = logging.getLogger(__name__) -def write_log(df: pl.DataFrame, prefix: str) -> None: +def write_log(df: pl.LazyFrame, prefix: str) -> None: logger.debug(f"Compressing and writing log: {prefix}_log.csv.gz") with gzip.open(f"{prefix}_log.csv.gz", 'wb') as f: - df.write_csv(f) + df.collect().write_csv(f) -def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: +def write_out(df: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: if not os.path.isdir(outdir): os.mkdir(outdir) logger.debug("Splitting by effect type") - effect_types: dict[str, pl.DataFrame] = _split_effect_type(df) + effect_types: dict[str, pl.DataFrame] = _split_effect_type(df.collect()) logger.debug("Deduplicating variants") deduplicated: dict[str, pl.DataFrame] = {k: _deduplicate_variants(k, v) for k, v in effect_types.items()} diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index ee62074..c3fa792 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -35,11 +35,9 @@ def from_path(cls, path): #@profile def read(self): if self.compressed: - df = self._read_compressed_chunks().rechunk().lazy() - return _filter_target(df) + return self._read_compressed_chunks().lazy() else: - df = self._read_uncompressed_chunks().rechunk().lazy() - return _filter_target(df) + return self._read_uncompressed_chunks().lazy() def _read_uncompressed_chunks(self): """ Read a CSV using a BufferedIOReader. This is a bit slower than pl.read_csv() (30s vs 5s). @@ -53,7 +51,7 @@ def _read_uncompressed_chunks(self): 3. Setting categorical dtypes on read 4. 
Don't rechunk until later """ - logger.debug("Reading uncompressed chunks") + logger.debug("Started reading uncompressed chunks") df_lst = [] dtypes = _get_col_dtypes(self.file_format) @@ -75,6 +73,8 @@ def _read_uncompressed_chunks(self): df_lst.append(df) + logger.debug("Finished reading uncompressed chunks") + logger.debug("Concatenating chunked data frames") return pl.concat(df_lst, rechunk=False) def _read_compressed_chunks(self): @@ -82,7 +82,7 @@ def _read_compressed_chunks(self): zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars """ - logger.debug("Reading zstd compressed data") + logger.debug("Started reading zstd compressed data") df_lst = [] dtypes = _get_col_dtypes(self.file_format) columns = _get_default_col_idx(self.file_format) @@ -111,6 +111,8 @@ def _read_compressed_chunks(self): df_lst.append(df) chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + logger.debug("Finished reading zstd compressed chunks") + logger.debug("Concatenating chunked data frames") return pl.concat(df_lst, rechunk=False) @@ -137,7 +139,7 @@ def _get_col_dtypes(file_format): # 4. Base-pair coordinate (1-based; limited to 231-2) # 5. Allele 1 (corresponding to clear bits in .bed; usually minor) # 6. Allele 2 (corresponding to set bits in .bed; usually major) - d = {'column_1': pl.Categorical, 'column_2': str, 'column_3': pl.Float64, 'column_4': pl.UInt64, + d = {'column_1': pl.Categorical, 'column_2': pl.Categorical, 'column_3': pl.Float64, 'column_4': pl.UInt64, 'column_5': pl.Categorical, 'column_6': pl.Categorical} case 'pvar': # 1. CHROM @@ -148,7 +150,7 @@ def _get_col_dtypes(file_format): # 6. QUAL (phred-scaled quality score for whether the locus is variable at all) # 7. FILTER ('PASS', '.', or semicolon-separated list of failing filter codes) # 8. INFO (semicolon-separated list of flags and key-value pairs, with types declared in header) - d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Utf8, 'column_4': pl.Categorical, + d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Categorical, 'column_4': pl.Categorical, 'column_5': pl.Utf8, 'column_6': pl.Float32, 'column_7': pl.Utf8, 'column_8': pl.Utf8} # can't cast ALT to cat yet, because of multiallelic variants! 
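Illustrative sketch (not part of the patch; the data are made up): the "can't cast ALT to cat yet" comment above is the constraint that drives these dtype choices. The .str.split() call used to explode multiallelic sites needs a Utf8 column, so ALT stays Utf8 at read time and read_target() only casts it to pl.Categorical after handle_multiallelic() has run. With the polars 0.14 API used in this patch:

    import polars as pl

    demo = pl.DataFrame({'ID': ['1:100:A:C', '1:200:C:T'], 'ALT': ['C', 'T,G']})
    demo.replace('ALT', demo['ALT'].str.split(by=','))           # ALT becomes a list column (in place)
    exploded = (demo.explode('ALT')                              # one row per alternate allele
                    .with_column(pl.col('ALT').cast(pl.Categorical)))  # safe to cast once biallelic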
case _: @@ -177,6 +179,3 @@ def _default_cols() -> list[str]: """ Standardise column names in a target genome """ return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] - -def _default_cols() -> list[str]: - return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] # only columns we want from a target genome From a559d76416e8291f804ef7802663a504dce0279b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 30 Sep 2022 14:14:44 +0100 Subject: [PATCH 25/46] improve RAM usage --- .../aggregate/aggregate_scores.py | 2 +- pgscatalog_utils/{log_config.py => config.py} | 6 + .../download/download_scorefile.py | 2 +- pgscatalog_utils/match/label.py | 2 - pgscatalog_utils/match/log.py | 9 +- pgscatalog_utils/match/match.py | 40 +++- pgscatalog_utils/match/match_variants.py | 69 ++++--- pgscatalog_utils/match/preprocess.py | 4 - pgscatalog_utils/match/read.py | 27 ++- .../scorefile/combine_scorefiles.py | 2 +- pgscatalog_utils/target.py | 176 +++++++++++------- poetry.lock | 77 ++------ 12 files changed, 233 insertions(+), 183 deletions(-) rename pgscatalog_utils/{log_config.py => config.py} (70%) diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py index 2787680..6109a7f 100644 --- a/pgscatalog_utils/aggregate/aggregate_scores.py +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -3,7 +3,7 @@ import pandas as pd -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level import glob import logging diff --git a/pgscatalog_utils/log_config.py b/pgscatalog_utils/config.py similarity index 70% rename from pgscatalog_utils/log_config.py rename to pgscatalog_utils/config.py index dcd9cbe..8bb2a57 100644 --- a/pgscatalog_utils/log_config.py +++ b/pgscatalog_utils/config.py @@ -1,4 +1,10 @@ import logging +import os + +try: + POLARS_MAX_THREADS: int = int(os.getenv('POLARS_MAX_THREADS')) +except TypeError: + POLARS_MAX_THREADS = 1 # not defined, it's better to be slow than set to n_cores (polars default) def set_logging_level(verbose: bool): diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index fc35529..c12467e 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -10,7 +10,7 @@ from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url from pgscatalog_utils.download.trait import query_trait -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index ad7423c..072fbb1 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -175,5 +175,3 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra .with_column(pl.max(["exclude", "exclude_ambiguous"])) .drop(["exclude", "exclude_ambiguous"]) .rename({"max": "exclude"})) - - diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 3b4686c..ac44084 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -14,12 +14,13 @@ def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, filter_su # make sure the aggregated best log matches the scoring file accession line count summary_count: pl.LazyFrame = (summary_log.groupby(pl.col('accession')) - .agg(pl.sum('count'))) + .agg(pl.sum('count'))) 
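Illustrative sketch (not part of the patch) of the fallback behaviour in the new config module above: os.getenv() returns None when POLARS_MAX_THREADS is unset, so int(None) raises TypeError and the conservative default of one thread applies. A non-numeric value such as POLARS_MAX_THREADS=all would raise ValueError instead, which that except clause does not catch.

    import os

    def _max_threads() -> int:
        """Mirrors pgscatalog_utils.config: default to 1 thread when the variable is unset."""
        try:
            return int(os.getenv('POLARS_MAX_THREADS'))
        except TypeError:
            return 1  # unset: better to be slow than to grab one thread per core

    os.environ.pop('POLARS_MAX_THREADS', None)   # simulate an unset variable
    assert _max_threads() == 1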
log_count: pl.DataFrame = (scorefile.groupby("accession") - .agg(pl.count()) - .join(summary_count, on='accession')).collect() + .agg(pl.count()) + .join(summary_count, on='accession')).collect() - assert (log_count.get_column('count') == log_count.get_column('count_right')).all(), "Log doesn't match input scoring file" + assert (log_count.get_column('count') == log_count.get_column( + 'count_right')).all(), "Log doesn't match input scoring file" logger.debug("Log matches input scoring file") return _prettify_log(big_log), _prettify_summary(summary_log) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index aedf941..7a9e0f3 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,4 +1,7 @@ +import gc import logging +import os +from tempfile import TemporaryDirectory import polars as pl @@ -7,12 +10,12 @@ logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool, remove_ambiguous: bool, - keep_first_match: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, remove_ambiguous: bool, + keep_first_match: bool, low_memory: bool) -> pl.DataFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) - matches: list[pl.DataFrame] = [] + matches: list[pl.LazyFrame()] = [] col_order = ['row_nr', 'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] @@ -31,12 +34,34 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) - # manually collect to avoid concat error TODO: try to reproduce and file a bug report - logger.debug("Collecting all matches (parallel)") - return pl.concat(pl.collect_all(matches)).lazy().pipe(label_matches, remove_ambiguous, keep_first_match) + if low_memory: + logger.debug("Batch collecting matches (low memory mode)") + match_lf = _batch_collect(matches) + else: + logger.debug("Collecting all matches (parallel)") + match_lf = pl.concat(pl.collect_all(matches)) + return match_lf.pipe(label_matches, remove_ambiguous, keep_first_match) -def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: + +def _batch_collect(matches: list[pl.LazyFrame]): + """ A slower alternative to pl.collect_all(), but this approach will use less peak memory + + This batches the .collect() and writes intermediate results to a temporary working directory + + IPC files are binary and remember column schema. Reading them can be extremely fast. 
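A small check of the schema claim above (illustrative, not part of the patch; the filename is made up): Arrow IPC files carry the schema alongside the data, so dtypes such as UInt64 survive the temporary staging, which a CSV round trip would not guarantee.

    import polars as pl

    df = (pl.DataFrame({'ID': ['rs1', 'rs2'], 'POS': [100, 200]})
          .with_column(pl.col('POS').cast(pl.UInt64)))
    df.write_ipc('chunk_0.ipc')
    assert pl.read_ipc('chunk_0.ipc').dtypes == df.dtypes   # schema preserved exactly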
""" + with TemporaryDirectory() as temp_dir: + n_chunks = 0 + for i, match in enumerate(matches): + out_path = os.path.join(temp_dir, str(i) + ".ipc") + match.collect().write_ipc(out_path) + n_chunks += 1 + logger.debug(f"Staged {n_chunks} match chunks to {temp_dir}") + gc.collect() + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")).lazy() + + +def _match_variants(scorefile: pl.LazyFrame, target: pl.LazyFrame, match_type: str) -> pl.LazyFrame: logger.debug(f"Matching strategy: {match_type}") match match_type: case 'refalt': @@ -87,4 +112,3 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s pl.col(effect_allele_column).alias("matched_effect_allele"), pl.lit(match_type).alias("match_type")]) .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join - diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 64311ee..e85d154 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,11 +1,12 @@ import argparse import logging +import os import textwrap from glob import glob import polars as pl -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level, POLARS_MAX_THREADS from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches @@ -19,29 +20,40 @@ def match_variants(): args = _parse_args() set_logging_level(args.verbose) - - logger.debug(f"polars n_threads: {pl.threadpool_size()}") + logger.debug(f"POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + logger.debug(f"polars threadpool size: {pl.threadpool_size()}") + logger.debug(f"Using {POLARS_MAX_THREADS} threads to read CSVs") with pl.StringCache(): - scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) - + scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) n_target_files = len(glob(args.target)) matches: pl.DataFrame - if n_target_files > 1 and not args.fast: + if n_target_files == 1 and not args.fast: + low_memory: bool = True + match_mode: str = 'single' + elif n_target_files > 1 and not args.fast: + low_memory: bool = True match_mode: str = 'multi' - else: + elif args.fast: + low_memory: bool = False match_mode: str = 'fast' match match_mode: + case "single": + logger.debug(f"Match mode: {match_mode}") # read one target in chunks + matches: pl.LazyFrame = _match_single_target(args.target, scorefile, args.remove_multiallelic, + args.skip_flip, args.remove_ambiguous, + args.keep_first_match, low_memory) case "multi": - logger.debug(f"Match mode: {match_mode}") - matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match) + logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks + matches: pl.LazyFrame = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, + args.skip_flip, args.remove_ambiguous, + args.keep_first_match, low_memory) case "fast": - logger.debug(f"Match mode: {match_mode}") - matches = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match) + logger.debug(f"Match mode: {match_mode}") # just read everything into memory for speed + matches: pl.LazyFrame = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, 
args.keep_first_match, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -61,8 +73,8 @@ def match_variants(): write_out(valid_matches, args.split, args.outdir, dataset) -def _check_target_chroms(target) -> None: - chroms: list[str] = target['#CHROM'].unique().to_list() +def _check_target_chroms(target: pl.LazyFrame) -> None: + chroms: list[str] = target.select(pl.col("#CHROM").unique()).collect().get_column("#CHROM").to_list() if len(chroms) > 1: logger.critical(f"Multiple chromosomes detected: {chroms}. Check input data.") raise Exception @@ -70,25 +82,34 @@ def _check_target_chroms(target) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: +def _fast_match(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, + skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes - target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) + target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match) + return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() + + +def _match_single_target(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, + skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, + low_memory: bool) -> pl.LazyFrame: + target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) + return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() -def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: +def _match_multiple_targets(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, + skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, + low_memory: bool) -> pl.LazyFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) + target: pl.LazyFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic, + low_memory=low_memory) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) - return pl.concat(matches) + matches.append(get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory)) + return pl.concat(matches).lazy() def _description_text() -> str: diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index d7b1e86..3f0c38d 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -2,8 +2,6 @@ import polars as pl -from pgscatalog_utils.target import logger - logger = 
logging.getLogger(__name__) @@ -68,5 +66,3 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: df.with_column( pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) - - diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 91e55a6..22271cf 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -3,39 +3,35 @@ import polars as pl +from pgscatalog_utils.config import POLARS_MAX_THREADS from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool) -> pl.DataFrame: +def read_target(path: str, remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: """ Read one or more targets from a path (may contain a wildcard) """ if '*' in path: logger.debug("Wildcard detected in target path: finding all matching files") paths: list[str] = glob.glob(path) else: - logger.debug("") + logger.debug("Found one matching target") paths: list[str] = [path] - targets: list[Target] = [Target.from_path(x) for x in paths] - dfs: list[pl.DataFrame] = [] - for target in targets: - assert target.file_format in ['bim', 'pvar'] - dfs.append(target.read()) + targets: list[Target] = [Target.from_path(x, low_memory) for x in paths] logger.debug("Reading all target data complete") - # explicitly rechunk now, because reading is complete and the input data were read unchunked to save memory - # only pipe functions once rechunking has happened to improve speed # handling multiallelic requires str methods, so don't forget to cast back or matching will break - return (pl.concat(dfs, rechunk=True) + return (pl.concat([x.read() for x in targets]) + .lazy() .pipe(filter_target) .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) .with_column(pl.col('ALT').cast(pl.Categorical))) -def read_scorefile(path: str) -> pl.DataFrame: +def read_scorefile(path: str) -> pl.LazyFrame: logger.debug("Reading scorefile") dtypes = {'chr_name': pl.Categorical, 'chr_position': pl.UInt64, @@ -43,11 +39,10 @@ def read_scorefile(path: str) -> pl.DataFrame: 'other_allele': pl.Utf8, 'effect_type': pl.Categorical, 'accession': pl.Categorical} - return (pl.scan_csv(path, sep='\t', dtype=dtypes) - .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) - .with_columns([ + return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=POLARS_MAX_THREADS) + .lazy() + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])).with_columns([ pl.col("effect_allele").cast(pl.Categorical), pl.col("other_allele").cast(pl.Categorical), pl.col("effect_allele_FLIP").cast(pl.Categorical), - pl.col("other_allele_FLIP").cast(pl.Categorical) - ])) + pl.col("other_allele_FLIP").cast(pl.Categorical)]) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 5b30fda..318d420 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -5,7 +5,7 @@ import pandas as pd -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights from 
pgscatalog_utils.scorefile.genome_build import build2GRC diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index c3fa792..9fd662d 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -1,8 +1,15 @@ -import zstandard -from dataclasses import dataclass +import gc import io import logging +import os +from dataclasses import dataclass +from itertools import islice +from tempfile import TemporaryDirectory + import polars as pl +import zstandard + +from pgscatalog_utils.config import POLARS_MAX_THREADS logger = logging.getLogger(__name__) @@ -13,9 +20,10 @@ class Target: file_format: str = None path: str = None compressed: bool = False + low_memory: bool = True # targets can be big, and use a lot of RAM when reading @classmethod - def from_path(cls, path): + def from_path(cls, path, low_memory): """ Create a Target object from a path. Cheaply detect file format and headers. """ try: with open(path, 'r') as f: @@ -30,93 +38,130 @@ def from_path(cls, path): file_format = _get_format(text_stream) compressed = True - return cls(file_format=file_format, path=path, compressed=compressed) + return cls(file_format=file_format, path=path, compressed=compressed, low_memory=low_memory) - #@profile def read(self): - if self.compressed: - return self._read_compressed_chunks().lazy() + if self.low_memory: + if self.compressed: + logger.debug("Reading compressed chunks from target genome (slower, lower RAM usage)") + return self._read_compressed_chunks() + else: + logger.debug("Reading uncompressed chunks from target genome (slower, lower RAM usage)") + return self._read_uncompressed_chunks() else: - return self._read_uncompressed_chunks().lazy() - - def _read_uncompressed_chunks(self): - """ Read a CSV using a BufferedIOReader. This is a bit slower than pl.read_csv() (30s vs 5s). - - Lots of testing showed that lazy scanning and native polars reading used a lot of RAM, then freed a bunch. - Plotting RAM usage against time looked like a spiky hedgehog. - - This function linearly consumes RAM in a more linear way by: - 1. Reading a batch of lines - 2. Dropping unused columns - 3. Setting categorical dtypes on read - 4. Don't rechunk until later - """ - logger.debug("Started reading uncompressed chunks") - - df_lst = [] + if self.compressed: + logger.debug("Reading compressed target genome (fast mode, high RAM usage)") + return self._read_compressed() + else: + logger.debug("Reading uncompressed target genome (fast mode, high RAM usage)") + return self._read_uncompressed() + + def _read_compressed(self) -> pl.DataFrame: + """ Read a zst compressed target as quickly as possible """ + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + with dctx.stream_reader(fh) as reader: + dtypes = _get_col_dtypes(self.file_format) + col_idxs = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + return (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=POLARS_MAX_THREADS)) + + def _read_uncompressed(self) -> pl.DataFrame: + """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. 
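Illustrative sketch (not part of the patch; the filename is made up) of the compressed fast path in _read_compressed() above: zstandard's stream_reader() wraps the compressed file in a file-like object that pl.read_csv() can consume directly, so no intermediate decompressed file is written to disk.

    import polars as pl
    import zstandard

    with open('chr1.pvar.zst', 'rb') as fh:
        reader = zstandard.ZstdDecompressor().stream_reader(fh)
        df = pl.read_csv(reader, sep='\t', has_header=False, comment_char='#')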
""" dtypes = _get_col_dtypes(self.file_format) col_idxs = _get_default_col_idx(self.file_format) new_col_names = _default_cols() + return (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=POLARS_MAX_THREADS)) - with open(self.path, "rb") as f: - while True: - buffer = b''.join(f.readlines(int(1e6))) + def _read_uncompressed_chunks(self) -> pl.DataFrame: + """ Read a CSV using a BufferedReader in batches to reduce memory usage. - if not buffer: - break + Reads 1 million variant chunks and immediately writes to feather format in a temporary directory. - df = (pl.read_csv(buffer, sep='\t', has_header=False, comment_char='#', n_threads=1, - dtype=dtypes, - columns=col_idxs, - new_columns=new_col_names, - rechunk=False)) + Read all temporary feather files and return a big pl.DataFrame. Reading feather is fast, and preserves dtypes. - df_lst.append(df) + Uses ~ 2GB + """ + dtypes = _get_col_dtypes(self.file_format) + col_idxs = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + with TemporaryDirectory() as temp_dir: + batch_n = 0 + batch_size = int(1e6) + with open(self.path, 'rb') as f: + while True: + line_batch = b''.join(islice(f, batch_size)) + if not line_batch: + break + + out_path = os.path.join(temp_dir, str(batch_n) + '.ipc') + + (pl.read_csv(line_batch, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=POLARS_MAX_THREADS).write_ipc(out_path)) + batch_n += 1 - logger.debug("Finished reading uncompressed chunks") - logger.debug("Concatenating chunked data frames") - return pl.concat(df_lst, rechunk=False) + gc.collect() # just to be safe + logger.debug(f"{batch_n} batches staged in temporary directory {temp_dir}") + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) - def _read_compressed_chunks(self): + def _read_compressed_chunks(self) -> pl.DataFrame: """ Like _read_uncompressed_chunks, but read chunks of bytes and handle incomplete rows zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars """ logger.debug("Started reading zstd compressed data") - df_lst = [] dtypes = _get_col_dtypes(self.file_format) columns = _get_default_col_idx(self.file_format) new_col_names = _default_cols() - with open(self.path, 'rb') as fh: - dctx = zstandard.ZstdDecompressor() - chunk_buffer = b'' + n_chunks = 0 - for chunk in dctx.read_to_iter(fh, read_size=int(1e+8)): # read 100MB of compressed data per chunk - if not chunk: - break - - end = chunk.rfind(b'\n') + 1 # only want to read complete rows - if chunk_buffer: - row_chunk = b''.join([chunk_buffer, chunk[:end]]) - chunk_buffer = b'' - else: - row_chunk = chunk[:end] - - df = pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', n_threads=1, + with TemporaryDirectory() as temp_dir: + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + chunk_buffer = b'' + + for chunk in dctx.read_to_iter(fh, read_size=int(1e8), write_size=int(1e8)): + if not chunk: + logger.debug("Finished reading zstd compressed chunks") + break + + end = chunk.rfind(b'\n') + 1 # only want to read complete rows, which end in \n + if chunk_buffer: + row_chunk = b''.join([chunk_buffer, chunk[:end]]) + chunk_buffer = b'' + else: + row_chunk = chunk[:end] + + out_path = os.path.join(temp_dir, str(n_chunks) + ".ipc") + (pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', dtype=dtypes, columns=columns, 
new_columns=new_col_names, - rechunk=False) - df_lst.append(df) - chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + n_threads=POLARS_MAX_THREADS) + .write_ipc(out_path)) - logger.debug("Finished reading zstd compressed chunks") - logger.debug("Concatenating chunked data frames") - return pl.concat(df_lst, rechunk=False) + chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + n_chunks += 1 + + gc.collect() # just to be safe + logger.debug(f"{n_chunks} chunks") # write_size will change n_chunks + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) def _get_default_col_idx(file_format): + """ Return a list of column integers to keep, assuming plink default column sets """ # import default columns: # ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] match file_format: @@ -130,7 +175,9 @@ def _get_default_col_idx(file_format): def _get_col_dtypes(file_format): - """ Manually set up dtypes. pl.Categorical saves a lot of RAM vs pl.Utf8 """ + """ Manually set up dtypes to save memory. Repeated strings like REF / ALT / CHROM work best as pl.Categorical. + + ID shouldn't be pl.Categorical, or you'll create a massive string cache and waste RAM """ match file_format: case 'bim': # 1. Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name @@ -139,7 +186,7 @@ def _get_col_dtypes(file_format): # 4. Base-pair coordinate (1-based; limited to 231-2) # 5. Allele 1 (corresponding to clear bits in .bed; usually minor) # 6. Allele 2 (corresponding to set bits in .bed; usually major) - d = {'column_1': pl.Categorical, 'column_2': pl.Categorical, 'column_3': pl.Float64, 'column_4': pl.UInt64, + d = {'column_1': pl.Categorical, 'column_2': pl.Utf8, 'column_3': pl.Float64, 'column_4': pl.UInt64, 'column_5': pl.Categorical, 'column_6': pl.Categorical} case 'pvar': # 1. CHROM @@ -150,7 +197,7 @@ def _get_col_dtypes(file_format): # 6. QUAL (phred-scaled quality score for whether the locus is variable at all) # 7. FILTER ('PASS', '.', or semicolon-separated list of failing filter codes) # 8. INFO (semicolon-separated list of flags and key-value pairs, with types declared in header) - d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Categorical, 'column_4': pl.Categorical, + d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Utf8, 'column_4': pl.Categorical, 'column_5': pl.Utf8, 'column_6': pl.Float32, 'column_7': pl.Utf8, 'column_8': pl.Utf8} # can't cast ALT to cat yet, because of multiallelic variants! case _: @@ -178,4 +225,3 @@ def _get_format(fh) -> str: def _default_cols() -> list[str]: """ Standardise column names in a target genome """ return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] - diff --git a/poetry.lock b/poetry.lock index b8afbdd..7eb7645 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,11 +1,3 @@ -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - [[package]] name = "attrs" version = "22.1.0" @@ -22,7 +14,7 @@ tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (> [[package]] name = "certifi" -version = "2022.6.15" +version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false @@ -41,7 +33,7 @@ pycparser = "*" [[package]] name = "charset-normalizer" -version = "2.1.0" +version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
category = "main" optional = false @@ -122,7 +114,7 @@ woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] [[package]] name = "idna" -version = "3.3" +version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" category = "main" optional = false @@ -138,7 +130,7 @@ python-versions = "*" [[package]] name = "jq" -version = "1.2.2" +version = "1.3.0" description = "jq is a lightweight and flexible JSON processor." category = "main" optional = false @@ -185,7 +177,7 @@ psutil = "*" [[package]] name = "numpy" -version = "1.23.1" +version = "1.23.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" optional = false @@ -204,7 +196,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pandas" -version = "1.4.3" +version = "1.5.0" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -216,7 +208,7 @@ python-dateutil = ">=2.8.1" pytz = ">=2020.1" [package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +test = ["pytest-xdist (>=1.31)", "pytest (>=6.0)", "hypothesis (>=5.5.3)"] [[package]] name = "pillow" @@ -244,20 +236,21 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.14.9" +version = "0.14.14" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] -pandas = ["pyarrow (>=4.0)", "pandas"] connectorx = ["connectorx"] +pyarrow = ["pyarrow (>=4.0)"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] numpy = ["numpy (>=1.16.0)"] +all = ["polars"] +pandas = ["pyarrow (>=4.0)", "pandas"] fsspec = ["fsspec"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] -pytz = ["pytz"] -pyarrow = ["pyarrow (>=4.0)"] [[package]] name = "psutil" @@ -315,14 +308,13 @@ python-versions = ">=3" [[package]] name = "pytest" -version = "7.1.2" +version = "7.1.3" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} iniconfig = "*" @@ -362,7 +354,7 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2022.1" +version = "2022.2.1" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -429,7 +421,7 @@ python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.11" +version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." 
category = "main" optional = false @@ -437,7 +429,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, [package.extras] brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -460,7 +452,6 @@ python-versions = "^3.10" content-hash = "a0d60a1fec35d248340f1640db49d07a7000b23e4bbe22426a9c240ee499c334" [metadata.files] -atomicwrites = [] attrs = [] certifi = [] cffi = [] @@ -470,10 +461,7 @@ contourpy = [] coverage = [] cycler = [] fonttools = [] -idna = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, -] +idna = [] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -484,29 +472,7 @@ matplotlib = [] memory-profiler = [] numpy = [] packaging = [] -pandas = [ - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:958a0588149190c22cdebbc0797e01972950c927a11a900fe6c2296f207b1d6f"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e48fbb64165cda451c06a0f9e4c7a16b534fcabd32546d531b3c240ce2844112"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f803320c9da732cc79210d7e8cc5c8019aad512589c910c66529eb1b1818230"}, - {file = "pandas-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:2893e923472a5e090c2d5e8db83e8f907364ec048572084c7d10ef93546be6d1"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:24ea75f47bbd5574675dae21d51779a4948715416413b30614c1e8b480909f81"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ebc990bd34f4ac3c73a2724c2dcc9ee7bf1ce6cf08e87bb25c6ad33507e318"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d6c0106415ff1a10c326c49bc5dd9ea8b9897a6ca0c8688eb9c30ddec49535ef"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78b00429161ccb0da252229bcda8010b445c4bf924e721265bec5a6e96a92e92"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfbf16b1ea4f4d0ee11084d9c026340514d1d30270eaa82a9f1297b6c8ecbf0"}, - {file = "pandas-1.4.3-cp38-cp38-win32.whl", hash = "sha256:48350592665ea3cbcd07efc8c12ff12d89be09cd47231c7925e3b8afada9d50d"}, - {file = "pandas-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:605d572126eb4ab2eadf5c59d5d69f0608df2bf7bcad5c5880a47a20a0699e3e"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a3924692160e3d847e18702bb048dc38e0e13411d2b503fecb1adf0fcf950ba4"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:07238a58d7cbc8a004855ade7b75bbd22c0db4b0ffccc721556bab8a095515f6"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:755679c49460bd0d2f837ab99f0a26948e68fa0718b7e42afbabd074d945bf84"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41fc406e374590a3d492325b889a2686b31e7a7780bec83db2512988550dadbf"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d9382f72a4f0e93909feece6fef5500e838ce1c355a581b3d8f259839f2ea76"}, - {file = "pandas-1.4.3-cp39-cp39-win32.whl", hash = "sha256:0daf876dba6c622154b2e6741f29e87161f844e64f84801554f879d27ba63c0d"}, - {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, - {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, -] +pandas = [] pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, @@ -530,10 +496,7 @@ python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] -pytz = [ - {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, - {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, -] +pytz = [] requests = [] setuptools-scm = [] six = [ From 5c15a67a32889c59b376d956a66522ab61b6b53e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 3 Oct 2022 10:15:39 +0100 Subject: [PATCH 26/46] fix reading bim files --- pgscatalog_utils/match/match.py | 1 + pgscatalog_utils/match/match_variants.py | 5 +++ pgscatalog_utils/target.py | 46 +++++++++++------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7a9e0f3..d0aeccf 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -10,6 +10,7 @@ logger = logging.getLogger(__name__) +# @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.DataFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index e85d154..187f436 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,6 +1,7 @@ import argparse import logging import os +import sys import textwrap from glob import glob @@ -29,6 +30,10 @@ def match_variants(): n_target_files = len(glob(args.target)) matches: pl.DataFrame + if n_target_files == 0: + logger.critical("No target genomes found, check the path") + sys.exit(1) + if n_target_files == 1 and not args.fast: low_memory: bool = True match_mode: str = 'single' diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 9fd662d..fbbcb8f 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -40,6 +40,7 @@ def from_path(cls, path, low_memory): return cls(file_format=file_format, 
path=path, compressed=compressed, low_memory=low_memory) + # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling def read(self): if self.low_memory: if self.compressed: @@ -62,8 +63,7 @@ def _read_compressed(self) -> pl.DataFrame: dctx = zstandard.ZstdDecompressor() with dctx.stream_reader(fh) as reader: dtypes = _get_col_dtypes(self.file_format) - col_idxs = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + col_idxs, new_col_names = _default_cols(self.file_format) return (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', dtype=dtypes, columns=col_idxs, @@ -73,8 +73,7 @@ def _read_compressed(self) -> pl.DataFrame: def _read_uncompressed(self) -> pl.DataFrame: """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. """ dtypes = _get_col_dtypes(self.file_format) - col_idxs = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + col_idxs, new_col_names = _default_cols(self.file_format) return (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', dtype=dtypes, columns=col_idxs, @@ -91,8 +90,7 @@ def _read_uncompressed_chunks(self) -> pl.DataFrame: Uses ~ 2GB """ dtypes = _get_col_dtypes(self.file_format) - col_idxs = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + col_idxs, new_col_names = _default_cols(self.file_format) with TemporaryDirectory() as temp_dir: batch_n = 0 batch_size = int(1e6) @@ -122,8 +120,7 @@ def _read_compressed_chunks(self) -> pl.DataFrame: """ logger.debug("Started reading zstd compressed data") dtypes = _get_col_dtypes(self.file_format) - columns = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + columns, new_col_names = _default_cols(self.file_format) n_chunks = 0 @@ -160,20 +157,6 @@ def _read_compressed_chunks(self) -> pl.DataFrame: return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) -def _get_default_col_idx(file_format): - """ Return a list of column integers to keep, assuming plink default column sets """ - # import default columns: - # ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] - match file_format: - case 'bim': - return [0, 1, 3, 4, 5] # see _get_col_dtypes, dropping centimorgans - case 'pvar': - return [0, 1, 2, 3, 4] # dropping QUAL FILTER INFO etc - case _: - logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") - raise Exception - - def _get_col_dtypes(file_format): """ Manually set up dtypes to save memory. Repeated strings like REF / ALT / CHROM work best as pl.Categorical. @@ -187,7 +170,7 @@ def _get_col_dtypes(file_format): # 5. Allele 1 (corresponding to clear bits in .bed; usually minor) # 6. Allele 2 (corresponding to set bits in .bed; usually major) d = {'column_1': pl.Categorical, 'column_2': pl.Utf8, 'column_3': pl.Float64, 'column_4': pl.UInt64, - 'column_5': pl.Categorical, 'column_6': pl.Categorical} + 'column_5': pl.Categorical, 'column_6': pl.Utf8} case 'pvar': # 1. CHROM # 2. 
POS (base-pair coordinate) @@ -222,6 +205,17 @@ def _get_format(fh) -> str: return file_format -def _default_cols() -> list[str]: - """ Standardise column names in a target genome """ - return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] +def _default_cols(file_format) -> tuple[list[int], list[str]]: + """ Return a list of column integers to keep, assuming plink default column sets """ + match file_format: + case 'bim': + idxs = [0, 1, 3, 4, 5] # see _get_col_dtypes, dropping centimorgans + names = ['#CHROM', 'ID', 'POS', 'REF', 'ALT'] # technically A1/A2, but it's ok + return idxs, names + case 'pvar': + idxs = [0, 1, 2, 3, 4] # dropping QUAL FILTER INFO etc + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] + return idxs, names + case _: + logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") + raise Exception From 0dc745dada2bd18a4bc4b47ab66933d7ff1c0210 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 3 Oct 2022 11:50:51 +0100 Subject: [PATCH 27/46] fix tests --- tests/match/test_label.py | 12 ++++++------ tests/match/test_match.py | 32 +++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/match/test_label.py b/tests/match/test_label.py index 8198335..bf354bd 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -29,7 +29,7 @@ def test_label(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # get_all_matches calls label_matches - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() logger.debug(labelled.select(['ID', 'match_type', 'best_match', 'ambiguous', 'match_status', 'exclude'])) @@ -43,7 +43,7 @@ def test_ambiguous_label(small_flipped_scorefile, small_target): """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() assert no_ambiguous['best_match'].to_list() == [True] assert no_ambiguous['ambiguous'].to_list() == [True] @@ -51,7 +51,7 @@ def test_ambiguous_label(small_flipped_scorefile, small_target): assert no_ambiguous['match_status'].to_list() == ["excluded"] # otherwise, ambiguous variants are kept - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect() assert labelled['best_match'].to_list() == [True] assert labelled['ambiguous'].to_list() == [True] @@ -105,7 +105,7 @@ def duplicated_matches(small_scorefile, small_target, request): scorefile, target = _cast_cat(dups, small_target) - return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param) + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param).collect() @pytest.fixture @@ -113,7 +113,7 @@ def multiple_match_types(small_target, small_scorefile): # skip flip will return two candidate matches for one target position: refalt + refalt_flip scorefile, 
target = _cast_cat(small_scorefile, small_target) return (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('chr_name') == 2)) + .filter(pl.col('chr_name') == '2')).collect() @pytest.fixture @@ -122,4 +122,4 @@ def duplicate_best_match(small_target, small_scorefile_no_oa): odd_target = {'#CHROM': [1, 1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'], 'ID': ['1:1:T:C', '1:1:A:A'], 'is_multiallelic': [False, False]} scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target)) - return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False).collect() diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 2c1c8f4..b8fbb07 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -5,7 +5,7 @@ import polars as pl import pytest -from pgscatalog_utils.match.match import get_all_matches, _cast_categorical +from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.match_variants import match_variants @@ -38,9 +38,23 @@ def test_match_pass(mini_scorefile, target_path, tmp_path): match_variants() -def _cast_cat(scorefile, target): +def _cast_cat(scorefile, target) -> tuple[pl.LazyFrame, pl.LazyFrame]: with pl.StringCache(): - return _cast_categorical(scorefile, target) + scorefile = scorefile.with_columns([ + pl.col("chr_name").cast(pl.Utf8).cast(pl.Categorical), + pl.col("effect_allele").cast(pl.Categorical), + pl.col("other_allele").cast(pl.Categorical), + pl.col("effect_type").cast(pl.Categorical), + pl.col("effect_allele_FLIP").cast(pl.Categorical), + pl.col("other_allele_FLIP").cast(pl.Categorical), + pl.col("accession").cast(pl.Categorical) + ]) + target = target.with_columns([ + pl.col("#CHROM").cast(pl.Utf8).cast(pl.Categorical), + pl.col("REF").cast(pl.Categorical), + pl.col("ALT").cast(pl.Categorical) + ]) + return scorefile.lazy(), target.lazy() def test_match_strategies(small_scorefile, small_target): @@ -48,13 +62,13 @@ def test_match_strategies(small_scorefile, small_target): # check unambiguous matches df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == False)) + .filter(pl.col('ambiguous') == False)).collect() assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == True)) + .filter(pl.col('ambiguous') == True)).collect() assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'}) @@ -64,14 +78,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == False)) + .filter(pl.col('ambiguous') == False)).collect() assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # check ambiguous matches flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, 
keep_first_match=False) - .filter(pl.col('ambiguous') == True)) + .filter(pl.col('ambiguous') == True)).collect() assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'}) @@ -79,12 +93,12 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect() assert set(df['ambiguous']) == {True} assert set(df['match_type']) == {'refalt'} flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == False)) + .filter(pl.col('ambiguous') == False)).collect() assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) From ba793fbf3194381b3a967cbfd1aba8d82ff2295d Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 3 Oct 2022 11:51:06 +0100 Subject: [PATCH 28/46] fix types --- pgscatalog_utils/match/match.py | 8 ++++---- pgscatalog_utils/match/preprocess.py | 13 ++----------- pgscatalog_utils/match/read.py | 3 +-- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index d0aeccf..049da3a 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -12,7 +12,7 @@ # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, remove_ambiguous: bool, - keep_first_match: bool, low_memory: bool) -> pl.DataFrame: + keep_first_match: bool, low_memory: bool = True) -> pl.LazyFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -42,10 +42,10 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bo logger.debug("Collecting all matches (parallel)") match_lf = pl.concat(pl.collect_all(matches)) - return match_lf.pipe(label_matches, remove_ambiguous, keep_first_match) + return match_lf.lazy().pipe(label_matches, remove_ambiguous, keep_first_match) -def _batch_collect(matches: list[pl.LazyFrame]): +def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: """ A slower alternative to pl.collect_all(), but this approach will use less peak memory This batches the .collect() and writes intermediate results to a temporary working directory @@ -59,7 +59,7 @@ def _batch_collect(matches: list[pl.LazyFrame]): n_chunks += 1 logger.debug(f"Staged {n_chunks} match chunks to {temp_dir}") gc.collect() - return pl.read_ipc(os.path.join(temp_dir, "*.ipc")).lazy() + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) def _match_variants(scorefile: pl.LazyFrame, target: pl.LazyFrame, match_type: str) -> pl.LazyFrame: diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3f0c38d..de2711f 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -def filter_target(df): +def filter_target(df: pl.DataFrame) -> pl.DataFrame: """ Remove variants that won't be matched against the scorefile Chromosomes 
1 - 22, X, and Y with an efficient join. Remmove variants with missing identifiers also @@ -45,11 +45,7 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - multiallelic_canary = (df.filter(pl.col('is_multiallelic') == True) - .limit(1) # just detect the first occurring - .collect()) - - if not multiallelic_canary.is_empty(): + if (df.get_column('is_multiallelic')).any(): logger.debug("Multiallelic variants detected") if remove_multiallelic: logger.debug('Dropping multiallelic variants') @@ -61,8 +57,3 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF else: logger.debug("No multiallelic variants detected") return df - - -def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: - df.with_column( - pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 22271cf..6bdcfc5 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -25,10 +25,9 @@ def read_target(path: str, remove_multiallelic: bool, low_memory: bool) -> pl.La logger.debug("Reading all target data complete") # handling multiallelic requires str methods, so don't forget to cast back or matching will break return (pl.concat([x.read() for x in targets]) - .lazy() .pipe(filter_target) .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) - .with_column(pl.col('ALT').cast(pl.Categorical))) + .with_column(pl.col('ALT').cast(pl.Categorical))).lazy() def read_scorefile(path: str) -> pl.LazyFrame: From a171e5b9cde0db9ab315ebdb2b76ffb359a82ab1 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 13:42:32 +0100 Subject: [PATCH 29/46] update poetry lock file --- poetry.lock | 73 +++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0d15470..2ae26df 100644 --- a/poetry.lock +++ b/poetry.lock @@ -70,7 +70,7 @@ bokeh = ["selenium", "bokeh"] [[package]] name = "coverage" -version = "6.4.4" +version = "6.5.0" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -92,7 +92,7 @@ python-versions = ">=3.6" [[package]] name = "fonttools" -version = "4.37.3" +version = "4.37.4" description = "Tools to manipulate font files" category = "dev" optional = false @@ -187,7 +187,7 @@ python-versions = ">=3.8" name = "packaging" version = "21.3" description = "Core utilities for Python packages" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -210,18 +210,6 @@ pytz = ">=2020.1" [package.extras] test = ["pytest-xdist (>=1.31)", "pytest (>=6.0)", "hypothesis (>=5.5.3)"] -[[package]] -name = "pillow" -version = "9.2.0" -description = "Python Imaging Library (Fork)" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] - [[package]] name = "pandas-schema" version = "0.3.6" @@ -235,6 +223,18 @@ numpy = "*" packaging = "*" pandas = ">=0.19" +[[package]] +name = "pillow" +version = "9.2.0" +description = "Python Imaging Library (Fork)" +category = "dev" +optional = false 
+python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pluggy" version = "1.0.0" @@ -249,21 +249,22 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.14.14" +version = "0.14.17" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] +pandas = ["pyarrow (>=4.0.0)", "pandas"] connectorx = ["connectorx"] -pyarrow = ["pyarrow (>=4.0)"] -timezone = ["backports.zoneinfo", "tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] +timezone = ["backports.zoneinfo", "tzdata"] +matplotlib = ["matplotlib"] +fsspec = ["fsspec"] numpy = ["numpy (>=1.16.0)"] all = ["polars"] -pandas = ["pyarrow (>=4.0)", "pandas"] -fsspec = ["fsspec"] +pyarrow = ["pyarrow (>=4.0.0)"] [[package]] name = "psutil" @@ -304,7 +305,7 @@ python-versions = "*" name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "dev" +category = "main" optional = false python-versions = ">=3.6.8" @@ -367,7 +368,7 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.4" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -462,7 +463,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "a0d60a1fec35d248340f1640db49d07a7000b23e4bbe22426a9c240ee499c334" +content-hash = "84b4520b176bb1b892c870fe894814cd05e217a86d7b4fadfa638b91a919bae5" [metadata.files] attrs = [] @@ -485,31 +486,9 @@ matplotlib = [] memory-profiler = [] numpy = [] packaging = [] -pillow = [] -pandas = [ - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:958a0588149190c22cdebbc0797e01972950c927a11a900fe6c2296f207b1d6f"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e48fbb64165cda451c06a0f9e4c7a16b534fcabd32546d531b3c240ce2844112"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f803320c9da732cc79210d7e8cc5c8019aad512589c910c66529eb1b1818230"}, - {file = "pandas-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:2893e923472a5e090c2d5e8db83e8f907364ec048572084c7d10ef93546be6d1"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:24ea75f47bbd5574675dae21d51779a4948715416413b30614c1e8b480909f81"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ebc990bd34f4ac3c73a2724c2dcc9ee7bf1ce6cf08e87bb25c6ad33507e318"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d6c0106415ff1a10c326c49bc5dd9ea8b9897a6ca0c8688eb9c30ddec49535ef"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78b00429161ccb0da252229bcda8010b445c4bf924e721265bec5a6e96a92e92"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6dfbf16b1ea4f4d0ee11084d9c026340514d1d30270eaa82a9f1297b6c8ecbf0"}, - {file = "pandas-1.4.3-cp38-cp38-win32.whl", hash = "sha256:48350592665ea3cbcd07efc8c12ff12d89be09cd47231c7925e3b8afada9d50d"}, - {file = "pandas-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:605d572126eb4ab2eadf5c59d5d69f0608df2bf7bcad5c5880a47a20a0699e3e"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a3924692160e3d847e18702bb048dc38e0e13411d2b503fecb1adf0fcf950ba4"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07238a58d7cbc8a004855ade7b75bbd22c0db4b0ffccc721556bab8a095515f6"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:755679c49460bd0d2f837ab99f0a26948e68fa0718b7e42afbabd074d945bf84"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41fc406e374590a3d492325b889a2686b31e7a7780bec83db2512988550dadbf"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d9382f72a4f0e93909feece6fef5500e838ce1c355a581b3d8f259839f2ea76"}, - {file = "pandas-1.4.3-cp39-cp39-win32.whl", hash = "sha256:0daf876dba6c622154b2e6741f29e87161f844e64f84801554f879d27ba63c0d"}, - {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, - {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, -] +pandas = [] pandas-schema = [] +pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, From 593757c0f37e2c652d882758507b275462e0167e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 14:37:56 +0100 Subject: [PATCH 30/46] treat lists of files consistently --- .../aggregate/aggregate_scores.py | 6 +-- pgscatalog_utils/match/match_variants.py | 40 +++++++++---------- pgscatalog_utils/match/read.py | 14 +------ 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py index 6109a7f..653a81d 100644 --- a/pgscatalog_utils/aggregate/aggregate_scores.py +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -13,7 +13,7 @@ def aggregate_scores(): args = _parse_args() set_logging_level(args.verbose) - df = aggregate(glob.glob(args.scores)) + df = aggregate(list(set(args.scores))) logger.debug("Compressing and writing combined scores") df.to_csv('aggregated_scores.txt.gz', sep='\t', compression='gzip') @@ -78,8 +78,8 @@ def _description_text() -> str: def _parse_args(args=None) -> argparse.Namespace: parser = argparse.ArgumentParser(description=_description_text(), formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('-s', '--scores', dest='scores', required=True, - help=' Path to scorefiles. Use a wildcard (*) to select multiple files.') + parser.add_argument('-s', '--scores', dest='scores', required=True, nargs='+', + help=' List of scorefile paths. 
Use a wildcard (*) to select multiple files.') parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', help=' Output directory to store downloaded files') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 187f436..6cc1747 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -27,7 +27,8 @@ def match_variants(): with pl.StringCache(): scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) - n_target_files = len(glob(args.target)) + target_paths = list(set(args.target)) + n_target_files = len(target_paths) matches: pl.DataFrame if n_target_files == 0: @@ -46,18 +47,20 @@ def match_variants(): match match_mode: case "single": - logger.debug(f"Match mode: {match_mode}") # read one target in chunks - matches: pl.LazyFrame = _match_single_target(args.target, scorefile, args.remove_multiallelic, + logger.debug(f"Match mode: {match_mode}") + # _fast_match with low_memory = True reads one target in chunks + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, args.remove_ambiguous, args.keep_first_match, low_memory) case "multi": logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks - matches: pl.LazyFrame = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, + matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, args.remove_ambiguous, args.keep_first_match, low_memory) case "fast": - logger.debug(f"Match mode: {match_mode}") # just read everything into memory for speed - matches: pl.LazyFrame = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + logger.debug(f"Match mode: {match_mode}") + # _fast_match with low_memory = False just read everything into memory for speed + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, args.remove_ambiguous, args.keep_first_match, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") @@ -87,30 +90,23 @@ def _check_target_chroms(target: pl.LazyFrame) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, +def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: # fast match is fast because: - # 1) all target files are read into memory + # 1) all target files are read into memory without batching # 2) matching occurs without iterating through chromosomes - target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) - logger.debug("Split target chromosomes not checked with fast match mode") + # when low memory is true and n_targets = 1, fast match is the same as "single" match mode + target: pl.LazyFrame = read_target(paths=target_paths, remove_multiallelic=remove_multiallelic, low_memory=low_memory) return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() -def _match_single_target(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, - skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, - low_memory: bool) -> 
pl.LazyFrame: - target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) - return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() - - -def _match_multiple_targets(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, +def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: matches = [] - for i, loc_target_current in enumerate(glob(target_path)): + for i, loc_target_current in enumerate(target_paths): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.LazyFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic, + target: pl.LazyFrame = read_target(paths=[loc_target_current], remove_multiallelic=remove_multiallelic, low_memory=low_memory) _check_target_chroms(target) matches.append(get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory)) @@ -171,8 +167,8 @@ def _parse_args(args=None): help=' Label for target genomic dataset') parser.add_argument('-s', '--scorefiles', dest='scorefile', required=True, help=' Combined scorefile path (output of read_scorefiles.py)') - parser.add_argument('-t', '--target', dest='target', required=True, - help=' A table of target genomic variants (.bim format)') + parser.add_argument('-t', '--target', dest='target', required=True, nargs='+', + help=' A list of paths of target genomic variants (.bim format)') parser.add_argument('-f', '--fast', dest='fast', action='store_true', help=' Enable faster matching at the cost of increased RAM usage') parser.add_argument('--split', dest='split', default=False, action='store_true', diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 6bdcfc5..ef12543 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,8 +1,7 @@ -import glob + import logging import polars as pl - from pgscatalog_utils.config import POLARS_MAX_THREADS from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target @@ -10,16 +9,7 @@ logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: - """ Read one or more targets from a path (may contain a wildcard) """ - - if '*' in path: - logger.debug("Wildcard detected in target path: finding all matching files") - paths: list[str] = glob.glob(path) - else: - logger.debug("Found one matching target") - paths: list[str] = [path] - +def read_target(paths: list[str], remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: targets: list[Target] = [Target.from_path(x, low_memory) for x in paths] logger.debug("Reading all target data complete") From 353d8f2349ebdb197987b91c41fb6232eb7ec523 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 4 Oct 2022 15:08:05 +0100 Subject: [PATCH 31/46] Setup a user agent for the download_scorefiles utils (REST API calls to the PGS Catalog) --- pgscatalog_utils/download/download_scorefile.py | 12 +++++++++--- pgscatalog_utils/download/publication.py | 4 ++-- pgscatalog_utils/download/score.py | 17 +++++++++++------ pgscatalog_utils/download/trait.py | 4 ++-- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py 
b/pgscatalog_utils/download/download_scorefile.py index f31c7ab..72a643d 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -33,13 +33,17 @@ def download_scorefile() -> None: pgs_lst: list[list[str]] = [] + pgsc_calc_info = None + if args.pgsc_calc: + pgsc_calc_info = args.pgsc_calc + if args.efo: logger.debug("--trait set, querying traits") - pgs_lst = pgs_lst + [query_trait(x) for x in args.efo] + pgs_lst = pgs_lst + [query_trait(x, pgsc_calc_info) for x in args.efo] if args.pgp: logger.debug("--pgp set, querying publications") - pgs_lst = pgs_lst + [query_publication(x) for x in args.pgp] + pgs_lst = pgs_lst + [query_publication(x, pgsc_calc_info) for x in args.pgp] if args.pgs: logger.debug("--id set, querying scores") @@ -47,7 +51,7 @@ def download_scorefile() -> None: pgs_id: list[str] = list(set(reduce(lambda x, y: x + y, pgs_lst))) - urls: dict[str, str] = get_url(pgs_id, args.build) + urls: dict[str, str] = get_url(pgs_id, args.build, pgsc_calc_info) for pgsid, url in urls.items(): logger.debug(f"Downloading {pgsid} from {url}") @@ -135,6 +139,8 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', help=' Output directory to store downloaded files') + parser.add_argument('-c', '--pgsc_calc', dest='pgsc_calc', + help=' Provide information about downloading scoring files via pgsc_calc') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py index 56c7f7b..675b263 100644 --- a/pgscatalog_utils/download/publication.py +++ b/pgscatalog_utils/download/publication.py @@ -6,10 +6,10 @@ logger = logging.getLogger(__name__) -def query_publication(pgp: str) -> list[str]: +def query_publication(pgp: str, user_agent:str = None) -> list[str]: logger.debug("Querying PGS Catalog with publication PGP ID") api: str = f'/publication/{pgp}' - results_json = query_api(api) + results_json = query_api(api, user_agent) if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {pgp}") diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index edad470..4b73916 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -1,6 +1,7 @@ import logging import sys +import pgscatalog_utils import jq import requests import time @@ -8,13 +9,13 @@ logger = logging.getLogger(__name__) -def get_url(pgs: list[str], build: str) -> dict[str, str]: +def get_url(pgs: list[str], build: str, user_agent:str = None) -> dict[str, str]: pgs_result: list[str] = [] url_result: list[str] = [] for chunk in _chunker(pgs): try: - response = _parse_json_query(query_score(chunk), build) + response = _parse_json_query(query_score(chunk,user_agent), build) pgs_result = pgs_result + list(response.keys()) url_result = url_result + list(response.values()) except TypeError: @@ -29,13 +30,17 @@ def get_url(pgs: list[str], build: str) -> dict[str, str]: return dict(zip(pgs_result, url_result)) -def query_api(api: str, retry:int = 0) -> dict: +def query_api(api: str, user_agent:str = None, retry:int = 0) -> dict: max_retries = 5 wait = 60 results_json = None rest_url_root = 'https://www.pgscatalog.org/rest' + # Set pgscatalog_utils user agent if none provided + if not user_agent: + user_agent = 
'pgscatalog_utils/'+pgscatalog_utils.__version__ try: - r: requests.models.Response = requests.get(rest_url_root+api) + headers = {'User-Agent': user_agent} + r: requests.models.Response = requests.get(rest_url_root+api, headers=headers) r.raise_for_status() results_json = r.json() except requests.exceptions.HTTPError as e: @@ -54,10 +59,10 @@ def query_api(api: str, retry:int = 0) -> dict: return results_json -def query_score(pgs_id: list[str]) -> dict: +def query_score(pgs_id: list[str], user_agent:str = None) -> dict: pgs: str = ','.join(pgs_id) api: str = f'/score/search?pgs_ids={pgs}' - results_json = query_api(api) + results_json = query_api(api, user_agent) return results_json diff --git a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index 83af414..609e3e1 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -6,10 +6,10 @@ logger = logging.getLogger(__name__) -def query_trait(trait: str) -> list[str]: +def query_trait(trait: str, user_agent:str = None) -> list[str]: logger.debug(f"Querying PGS Catalog with trait {trait}") api: str = f'/trait/{trait}?include_children=1' - results_json = query_api(api) + results_json = query_api(api, user_agent) if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}") From cc41b4f7db1bfe829fcbb65203a0973c31df7b12 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 4 Oct 2022 15:13:33 +0100 Subject: [PATCH 32/46] Improve library call --- pgscatalog_utils/download/score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index 4b73916..3c2bf29 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -1,10 +1,10 @@ import logging import sys -import pgscatalog_utils import jq import requests import time +from pgscatalog_utils import __version__ as pgscatalog_utils_version logger = logging.getLogger(__name__) @@ -37,7 +37,7 @@ def query_api(api: str, user_agent:str = None, retry:int = 0) -> dict: rest_url_root = 'https://www.pgscatalog.org/rest' # Set pgscatalog_utils user agent if none provided if not user_agent: - user_agent = 'pgscatalog_utils/'+pgscatalog_utils.__version__ + user_agent = 'pgscatalog_utils/'+pgscatalog_utils_version try: headers = {'User-Agent': user_agent} r: requests.models.Response = requests.get(rest_url_root+api, headers=headers) From 4eec95b2e9fecb70afb2b1723a078a462266b45c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 16:50:28 +0100 Subject: [PATCH 33/46] don't hold scorefiles in memory when combining them --- .../scorefile/combine_scorefiles.py | 24 +++++++------------ pgscatalog_utils/scorefile/write.py | 15 ++++++++++-- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 318d420..e8dc610 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,10 +1,9 @@ import argparse import logging +import os import sys import textwrap -import pandas as pd - from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights @@ -25,7 +24,10 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input 
scorefiles: {paths}") - scorefiles = [] + if os.path.exists(args.outfile): + logger.critical(f"Output file {args.outfile} already exists") + raise Exception + for x in paths: # Read scorefile df and header h, score = load_scorefile(x) @@ -65,19 +67,11 @@ def combine_scorefiles(): logger.error("Try running with --liftover and specifying the --chain_dir") raise Exception - scorefiles.append(score) - - if len(scorefiles) > 0: - scorefiles: pd.DataFrame = pd.concat(scorefiles) - else: - logger.error("No valid scorefiles could be combined") - raise Exception - - if args.liftover: - logger.debug("Annotating scorefiles with liftover parameters") - scorefiles = liftover(scorefiles, args.chain_dir, args.min_lift, args.target_build) + if args.liftover: + logger.debug("Annotating scorefile with liftover parameters") + score = liftover(score, args.chain_dir, args.min_lift, args.target_build) - write_scorefile(scorefiles, args.outfile) + write_scorefile(score, args.outfile) def _description_text() -> str: diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 0dd7b38..175bcab 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -1,4 +1,5 @@ import logging +import os import pandas as pd @@ -9,6 +10,15 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'is_duplicated', 'accession', 'row_nr'] + if os.path.exists(path): + logger.debug("Output file exists: setting write mode to append") + write_mode = 'a' + header = False + else: + logger.debug("Output file doesn't exist: setting write mode to write (create new file)") + write_mode = 'w' + header = True + if df.empty: logger.error("Empty scorefile output! Please check the input data") raise Exception @@ -20,12 +30,13 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: if 'other_allele' not in out_df: logger.warning("No other allele information detected, writing out as missing data") out_df['other_allele'] = None + if path.endswith('.gz'): logger.debug("Writing out gzip-compressed combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip') + out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) else: logger.debug("Writing out combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t") + out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: From 03699e28ce463cddb9c493117550a3cde485a0d6 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 17:24:39 +0100 Subject: [PATCH 34/46] check if input and outputs are empty in combine_scorefiles --- .../scorefile/combine_scorefiles.py | 8 +++++ pgscatalog_utils/scorefile/write.py | 30 ++++++++----------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index e8dc610..b7adaa9 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -32,6 +32,10 @@ def combine_scorefiles(): # Read scorefile df and header h, score = load_scorefile(x) + if score.empty: + logger.critical(f"Empty scorefile {x} detected! 
Please check the input data") + raise Exception + # Check if we should use the harmonized positions use_harmonised = False current_build = None @@ -71,6 +75,10 @@ def combine_scorefiles(): logger.debug("Annotating scorefile with liftover parameters") score = liftover(score, args.chain_dir, args.min_lift, args.target_build) + if score.empty: + logger.critical("Empty output score detected, something went wrong while combining") + raise Exception + write_scorefile(score, args.outfile) diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 175bcab..8a3233b 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -19,24 +19,20 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: write_mode = 'w' header = True - if df.empty: - logger.error("Empty scorefile output! Please check the input data") - raise Exception + out_df: pd.DataFrame = (df.drop('accession', axis=1) + .rename({'filename_prefix': 'accession'}, axis=1) + .pipe(_filter_failed_liftover)) + + if 'other_allele' not in out_df: + logger.warning("No other allele information detected, writing out as missing data") + out_df['other_allele'] = None + + if path.endswith('.gz'): + logger.debug("Writing out gzip-compressed combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) else: - out_df: pd.DataFrame = (df.drop('accession', axis=1) - .rename({'filename_prefix': 'accession'}, axis=1) - .pipe(_filter_failed_liftover)) - - if 'other_allele' not in out_df: - logger.warning("No other allele information detected, writing out as missing data") - out_df['other_allele'] = None - - if path.endswith('.gz'): - logger.debug("Writing out gzip-compressed combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) - else: - logger.debug("Writing out combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) + logger.debug("Writing out combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: From 50b1517fc54d9277357dcef9b3ca054f0db88038 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 4 Oct 2022 17:29:30 +0100 Subject: [PATCH 35/46] Handle case where we might be removing the missing variants (not default) --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index b7adaa9..bcafa61 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -75,7 +75,7 @@ def combine_scorefiles(): logger.debug("Annotating scorefile with liftover parameters") score = liftover(score, args.chain_dir, args.min_lift, args.target_build) - if score.empty: + if score.empty and (args.drop_missing is False): logger.critical("Empty output score detected, something went wrong while combining") raise Exception From 0a5dfbe3d2541b62bc46189d1fd6259ef1d96659 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 5 Oct 2022 17:45:15 +0100 Subject: [PATCH 36/46] add parameter for n_threads, set POLARS_MAX_THREADS with it --- pgscatalog_utils/config.py | 6 +----- pgscatalog_utils/match/match_variants.py | 13 ++++++++----- pgscatalog_utils/match/read.py | 4 ++-- pgscatalog_utils/target.py | 6 +++--- 4 
files changed, 14 insertions(+), 15 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 8bb2a57..7a6b8eb 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -1,10 +1,6 @@ import logging -import os -try: - POLARS_MAX_THREADS: int = int(os.getenv('POLARS_MAX_THREADS')) -except TypeError: - POLARS_MAX_THREADS = 1 # not defined, it's better to be slow than set to n_cores (polars default) +POLARS_MAX_THREADS = 1 # dummy value, is reset by args.n_threads (default: 1) def set_logging_level(verbose: bool): diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 6cc1747..698607e 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -3,11 +3,10 @@ import os import sys import textwrap -from glob import glob import polars as pl -from pgscatalog_utils.config import set_logging_level, POLARS_MAX_THREADS +import pgscatalog_utils.config as config from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches @@ -19,11 +18,14 @@ def match_variants(): args = _parse_args() + config.set_logging_level(args.verbose) - set_logging_level(args.verbose) - logger.debug(f"POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + config.POLARS_MAX_THREADS = args.n_threads + os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) + # now the environment variable, parsed argument args.n_threads, and threadpool should agree + logger.debug(f"Setting POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + logger.debug(f"Using {config.POLARS_MAX_THREADS} threads to read CSVs") logger.debug(f"polars threadpool size: {pl.threadpool_size()}") - logger.debug(f"Using {POLARS_MAX_THREADS} threads to read CSVs") with pl.StringCache(): scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) @@ -171,6 +173,7 @@ def _parse_args(args=None): help=' A list of paths of target genomic variants (.bim format)') parser.add_argument('-f', '--fast', dest='fast', action='store_true', help=' Enable faster matching at the cost of increased RAM usage') + parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) parser.add_argument('--split', dest='split', default=False, action='store_true', help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index ef12543..cab5d80 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -2,8 +2,8 @@ import logging import polars as pl -from pgscatalog_utils.config import POLARS_MAX_THREADS from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target +import pgscatalog_utils.config as config from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) @@ -28,7 +28,7 @@ def read_scorefile(path: str) -> pl.LazyFrame: 'other_allele': pl.Utf8, 'effect_type': pl.Categorical, 'accession': pl.Categorical} - return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=POLARS_MAX_THREADS) + return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=config.POLARS_MAX_THREADS) .lazy() .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])).with_columns([ pl.col("effect_allele").cast(pl.Categorical), diff --git a/pgscatalog_utils/target.py 
b/pgscatalog_utils/target.py index fbbcb8f..ca6755c 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -9,7 +9,7 @@ import polars as pl import zstandard -from pgscatalog_utils.config import POLARS_MAX_THREADS +import pgscatalog_utils.config as config logger = logging.getLogger(__name__) @@ -68,7 +68,7 @@ def _read_compressed(self) -> pl.DataFrame: dtype=dtypes, columns=col_idxs, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS)) + n_threads=config.POLARS_MAX_THREADS)) def _read_uncompressed(self) -> pl.DataFrame: """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. """ @@ -78,7 +78,7 @@ def _read_uncompressed(self) -> pl.DataFrame: dtype=dtypes, columns=col_idxs, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS)) + n_threads=config.POLARS_MAX_THREADS)) def _read_uncompressed_chunks(self) -> pl.DataFrame: """ Read a CSV using a BufferedReader in batches to reduce memory usage. From ca1734e922e84406c03ebe518ca2101ff759c714 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 5 Oct 2022 17:47:30 +0100 Subject: [PATCH 37/46] move dropping multiallelics from preprocessing to labelling --- pgscatalog_utils/match/label.py | 22 ++++++++++++-- pgscatalog_utils/match/match.py | 6 ++-- pgscatalog_utils/match/match_variants.py | 37 +++++++++++++----------- pgscatalog_utils/match/preprocess.py | 14 ++++----- pgscatalog_utils/match/read.py | 6 ++-- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 072fbb1..9be6316 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -18,8 +18,9 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match) - .pipe(_label_duplicate_id, keep_first_match) - .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .pipe(_label_duplicate_id, params['keep_first_match']) + .pipe(_label_biallelic_ambiguous, params['remove_ambiguous']) + .pipe(_label_multiallelic, params['remove_multiallelic']) .with_column(pl.lit(True).alias('match_candidate'))) return _encode_match_priority(labelled) @@ -175,3 +176,20 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra .with_column(pl.max(["exclude", "exclude_ambiguous"])) .drop(["exclude", "exclude_ambiguous"]) .rename({"max": "exclude"})) + + +def _label_multiallelic(df: pl.LazyFrame, remove_multiallelic: bool) -> pl.LazyFrame: + """ Label multiallelic variants with exclude flag + + (Multiallelic variants are already labelled with the "is_multiallelic" column in match.preprocess) + """ + if remove_multiallelic: + logger.debug("Labelling multiallelic matches with exclude flag") + return df.with_column(pl.when(pl.col('is_multiallelic') == True) + .then(True) + .otherwise(pl.col('exclude')) # don't overwrite existing exclude flags + .alias('exclude')) + else: + logger.debug("Not excluding multiallelic variants") + return df + diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 049da3a..8f79d4c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -11,8 +11,8 @@ # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling -def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, 
remove_ambiguous: bool, - keep_first_match: bool, low_memory: bool = True) -> pl.LazyFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: dict[str: bool], + low_memory: bool = True) -> pl.LazyFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -42,7 +42,7 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bo logger.debug("Collecting all matches (parallel)") match_lf = pl.concat(pl.collect_all(matches)) - return match_lf.lazy().pipe(label_matches, remove_ambiguous, keep_first_match) + return match_lf.lazy().pipe(label_matches, label_params) def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 698607e..23bce5f 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -51,19 +51,14 @@ def match_variants(): case "single": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = True reads one target in chunks - matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, - args.skip_flip, args.remove_ambiguous, - args.keep_first_match, low_memory) + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args, low_memory) case "multi": logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks - matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args.remove_multiallelic, - args.skip_flip, args.remove_ambiguous, - args.keep_first_match, low_memory) + matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args, low_memory) case "fast": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = False just read everything into memory for speed - matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match, low_memory) + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -92,26 +87,26 @@ def _check_target_chroms(target: pl.LazyFrame) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, - skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: +def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, + args: argparse.Namespace, low_memory: bool) -> pl.LazyFrame: # fast match is fast because: # 1) all target files are read into memory without batching # 2) matching occurs without iterating through chromosomes # when low memory is true and n_targets = 1, fast match is the same as "single" match mode - target: pl.LazyFrame = read_target(paths=target_paths, remove_multiallelic=remove_multiallelic, low_memory=low_memory) - return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() + params: dict[str, bool] = _make_params_dict(args) + target: pl.LazyFrame = read_target(paths=target_paths, low_memory=low_memory) + return get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory).lazy() -def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, - skip_flip: bool, remove_ambiguous: 
bool, keep_first_match: bool, +def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace, low_memory: bool) -> pl.LazyFrame: matches = [] + params: dict[str, bool] = _make_params_dict(args) for i, loc_target_current in enumerate(target_paths): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.LazyFrame = read_target(paths=[loc_target_current], remove_multiallelic=remove_multiallelic, - low_memory=low_memory) + target: pl.LazyFrame = read_target(paths=[loc_target_current], low_memory=low_memory) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory)) + matches.append(get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory)) return pl.concat(matches).lazy() @@ -201,5 +196,13 @@ def _parse_args(args=None): return parser.parse_args(args) +def _make_params_dict(args) -> dict[str, bool]: + """ Make a dictionary with parameters that control labelling match candidates """ + return {'keep_first_match': args.keep_first_match, + 'remove_ambiguous': args.remove_ambiguous, + 'skip_flip': args.skip_flip, + 'remove_multiallelic': args.remove_multiallelic} + + if __name__ == "__main__": match_variants() diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index de2711f..9997176 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -37,7 +37,8 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataFrame: +def annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: + """ Identify variants that are multiallelic with a column flag """ # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) @@ -46,14 +47,9 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF .alias('is_multiallelic'))) if (df.get_column('is_multiallelic')).any(): - logger.debug("Multiallelic variants detected") - if remove_multiallelic: - logger.debug('Dropping multiallelic variants') - return df.filter(pl.col('is_multiallelic') == False) - else: - logger.debug("Exploding dataframe to handle multiallelic variants") - df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants - return df.explode('ALT') # expand the DF to have all the variants in different rows + logger.debug("Exploding dataframe to handle multiallelic variants") + df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants + return df.explode('ALT') # expand the DF to have all the variants in different rows else: logger.debug("No multiallelic variants detected") return df diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index cab5d80..e7417f1 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -2,21 +2,21 @@ import logging import polars as pl -from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target import pgscatalog_utils.config as config +from pgscatalog_utils.match.preprocess import annotate_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) -def read_target(paths: list[str], remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: +def 
read_target(paths: list[str], low_memory: bool) -> pl.LazyFrame: targets: list[Target] = [Target.from_path(x, low_memory) for x in paths] logger.debug("Reading all target data complete") # handling multiallelic requires str methods, so don't forget to cast back or matching will break return (pl.concat([x.read() for x in targets]) .pipe(filter_target) - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) + .pipe(annotate_multiallelic) .with_column(pl.col('ALT').cast(pl.Categorical))).lazy() From 8f1f771624318bfcf9f8ce408801a7ddcc500930 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 5 Oct 2022 17:48:07 +0100 Subject: [PATCH 38/46] move skipping flips from matching to labelling --- pgscatalog_utils/match/label.py | 18 +++++++++++++++++- pgscatalog_utils/match/log.py | 4 ++-- pgscatalog_utils/match/match.py | 10 ++++------ 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 9be6316..bc9b56c 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.DataFrame: +def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: """ Label match candidates with additional metadata. Column definitions: - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) @@ -21,6 +21,7 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .pipe(_label_duplicate_id, params['keep_first_match']) .pipe(_label_biallelic_ambiguous, params['remove_ambiguous']) .pipe(_label_multiallelic, params['remove_multiallelic']) + .pipe(_label_flips, params['skip_flip']) .with_column(pl.lit(True).alias('match_candidate'))) return _encode_match_priority(labelled) @@ -193,3 +194,18 @@ def _label_multiallelic(df: pl.LazyFrame, remove_multiallelic: bool) -> pl.LazyF logger.debug("Not excluding multiallelic variants") return df + +def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame: + df = df.with_column(pl.when(pl.col('match_type').str.contains('_FLIP')) + .then(True) + .otherwise(False) + .alias('is_flipped')) + if skip_flip: + logger.debug("Labelling flipped matches with exclude flag") + return df.with_column(pl.when(pl.col('is_flipped') == True) + .then(True) + .otherwise(pl.col('exclude')) # don't overwrite existing exclude flags + .alias('exclude')) + else: + logger.debug("Not excluding flipped matches") + return df \ No newline at end of file diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index ac44084..d2acf42 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -30,8 +30,8 @@ def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) - """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches - .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', - 'duplicate_ID']) + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'is_flipped', + 'duplicate_best_match', 'duplicate_ID']) .agg(pl.count()) .join(filter_summary, how='left', on='accession')) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 8f79d4c..7022eea 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -24,16 
+24,14 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: logger.debug("Getting matches for scores with effect allele and other allele") matches.append(_match_variants(scorefile=scorefile_oa, target=target, match_type="refalt").select(col_order)) matches.append(_match_variants(scorefile_oa, target, match_type="altref").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_oa, target, match_type="refalt_flip").select(col_order)) - matches.append(_match_variants(scorefile_oa, target, match_type="altref_flip").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="refalt_flip").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="altref_flip").select(col_order)) logger.debug("Getting matches for scores with effect allele only") matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) - matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) if low_memory: logger.debug("Batch collecting matches (low memory mode)") From f5b64cfd9a1f3ec428840d4fd9034d463b70dcd7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 6 Oct 2022 16:09:03 +0100 Subject: [PATCH 39/46] is_flipped -> match_flipped, fix uppercase match type --- pgscatalog_utils/match/label.py | 6 +++--- pgscatalog_utils/match/log.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index bc9b56c..357e3f6 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -196,13 +196,13 @@ def _label_multiallelic(df: pl.LazyFrame, remove_multiallelic: bool) -> pl.LazyF def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame: - df = df.with_column(pl.when(pl.col('match_type').str.contains('_FLIP')) + df = df.with_column(pl.when(pl.col('match_type').str.contains('_flip')) .then(True) .otherwise(False) - .alias('is_flipped')) + .alias('match_flipped')) if skip_flip: logger.debug("Labelling flipped matches with exclude flag") - return df.with_column(pl.when(pl.col('is_flipped') == True) + return df.with_column(pl.when(pl.col('match_flipped') == True) .then(True) .otherwise(pl.col('exclude')) # don't overwrite existing exclude flags .alias('exclude')) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index d2acf42..6143308 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -30,7 +30,7 @@ def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) - """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches - .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'is_flipped', + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped', 'duplicate_best_match', 'duplicate_ID']) .agg(pl.count()) .join(filter_summary, how='left', on='accession')) From 
5b18299540a87645adc528fb99df102634a2a1ec Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 7 Oct 2022 11:23:07 +0100 Subject: [PATCH 40/46] move label_matches from get_all_matches to match_variants --- pgscatalog_utils/match/label.py | 17 +++++++++-------- pgscatalog_utils/match/match.py | 7 ++----- pgscatalog_utils/match/match_variants.py | 9 ++++++--- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 357e3f6..1c55ba3 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: +def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: """ Label match candidates with additional metadata. Column definitions: - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) @@ -15,6 +15,7 @@ def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ + assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip'} labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match) @@ -27,7 +28,7 @@ def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: return _encode_match_priority(labelled) -def _encode_match_priority(df: pl.DataFrame) -> pl.DataFrame: +def _encode_match_priority(df: pl.LazyFrame) -> pl.LazyFrame: """ Encode a new column called match status containing matched, unmatched, excluded, and not_best """ return (df.with_columns([ # set false best match to not_best @@ -41,7 +42,7 @@ def _encode_match_priority(df: pl.DataFrame) -> pl.DataFrame: .cast(pl.Categorical)).drop(["max", "excluded_match_priority", "match_priority"])) -def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: +def _label_best_match(df: pl.LazyFrame) -> pl.LazyFrame: """ Best matches have the lowest match priority type. Find the best matches and label them. 
""" logger.debug("Labelling best match type (refalt > altref > ...)") match_priority = {'refalt': 0, 'altref': 1, 'refalt_flip': 2, 'altref_flip': 3, 'no_oa_ref': 4, 'no_oa_alt': 5, @@ -50,7 +51,7 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: # use a groupby aggregation to guarantee the number of rows stays the same # rows were being lost using an anti join + reduce approach - prioritised: pl.DataFrame = (df.with_column(pl.col('match_type') + prioritised: pl.LazyFrame = (df.with_column(pl.col('match_type') .apply(lambda x: match_priority[x]) .alias('match_priority')) .with_column(pl.col("match_priority") @@ -66,7 +67,7 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: return prioritised.drop(['match_priority', 'best_match_type']) -def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: +def _label_duplicate_best_match(df: pl.LazyFrame) -> pl.LazyFrame: """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ @@ -82,7 +83,7 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: Label the first row with best_match = true, and duplicate rows with best_match = false """ logger.debug("Labelling duplicated best match: keeping first instance as best_match = True") - labelled: pl.DataFrame = (df.with_column(pl.col('best_match') + labelled: pl.LazyFrame = (df.with_column(pl.col('best_match') .count() .over(['accession', 'row_nr', 'best_match']) .alias('count')) @@ -106,7 +107,7 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: return labelled -def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: +def _label_duplicate_id(df: pl.LazyFrame, keep_first_match: bool) -> pl.LazyFrame: """ Label best match duplicates made when the scoring file is remapped to a different genome build ┌─────────┬────────────────────────┬─────────────┬────────────────┬─────┬────────────┐ @@ -153,7 +154,7 @@ def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFram .rename({"max": "exclude"})) -def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: +def _label_biallelic_ambiguous(df: pl.LazyFrame, remove_ambiguous) -> pl.LazyFrame: logger.debug("Labelling ambiguous variants") ambig = ((df.with_columns([ pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7022eea..4363dd5 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -5,14 +5,11 @@ import polars as pl -from pgscatalog_utils.match.label import label_matches - logger = logging.getLogger(__name__) # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling -def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: dict[str: bool], - low_memory: bool = True) -> pl.LazyFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, low_memory: bool = True) -> pl.LazyFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -40,7 +37,7 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: logger.debug("Collecting all matches (parallel)") match_lf = pl.concat(pl.collect_all(matches)) - return match_lf.lazy().pipe(label_matches, 
label_params) + return match_lf.lazy() def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 23bce5f..1fc322d 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -8,6 +8,7 @@ import pgscatalog_utils.config as config from pgscatalog_utils.match.filter import filter_scores +from pgscatalog_utils.match.label import label_matches from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.read import read_target, read_scorefile @@ -95,7 +96,8 @@ def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, # when low memory is true and n_targets = 1, fast match is the same as "single" match mode params: dict[str, bool] = _make_params_dict(args) target: pl.LazyFrame = read_target(paths=target_paths, low_memory=low_memory) - return get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory).lazy() + return (get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory) + .pipe(label_matches, params=params)) def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace, @@ -106,8 +108,9 @@ def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, ar logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.LazyFrame = read_target(paths=[loc_target_current], low_memory=low_memory) _check_target_chroms(target) - matches.append(get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory)) - return pl.concat(matches).lazy() + matches.append(get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory)) + return (pl.concat(matches) + .pipe(label_params=params)) def _description_text() -> str: From df44d9becaa5bc19add8fb81f32838c754239e96 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 7 Oct 2022 11:23:28 +0100 Subject: [PATCH 41/46] fix setting n_threads when reading --- pgscatalog_utils/target.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index ca6755c..3573ee6 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -106,7 +106,7 @@ def _read_uncompressed_chunks(self) -> pl.DataFrame: dtype=dtypes, columns=col_idxs, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS).write_ipc(out_path)) + n_threads=config.POLARS_MAX_THREADS).write_ipc(out_path)) batch_n += 1 gc.collect() # just to be safe @@ -146,7 +146,7 @@ def _read_compressed_chunks(self) -> pl.DataFrame: dtype=dtypes, columns=columns, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS) + n_threads=config.POLARS_MAX_THREADS) .write_ipc(out_path)) chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) From be96d14d325567c6fcff8f7601c5617fd147ba3f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 7 Oct 2022 11:23:36 +0100 Subject: [PATCH 42/46] fix tests --- tests/match/test_label.py | 86 ++++++++++++++++++++++++++------------- tests/match/test_match.py | 51 ++++++++++++++++------- 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/tests/match/test_label.py b/tests/match/test_label.py index bf354bd..ebe0c43 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -4,6 +4,7 @@ import pytest import polars as pl +from pgscatalog_utils.match.label import 
label_matches from pgscatalog_utils.match.match import get_all_matches from tests.match.test_match import _cast_cat @@ -29,37 +30,54 @@ def test_label(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # get_all_matches calls label_matches - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() + params = {'skip_flip': True, 'remove_ambiguous': True, 'remove_multiallelic': False, 'keep_first_match': False} + labelled: pl.DataFrame = (get_all_matches(scorefile=scorefile, target=target) + .pipe(label_matches, params=params) + .collect()) logger.debug(labelled.select(['ID', 'match_type', 'best_match', 'ambiguous', 'match_status', 'exclude'])) - assert labelled['best_match'].to_list() == [True, True, True] - assert labelled['ambiguous'].to_list() == [False, True, False] - assert labelled['exclude'].to_list() == [False, True, False] - assert labelled['match_status'].to_list() == ["matched", "excluded", "matched"] + assert labelled['best_match'].to_list() == [True, True, True, False] + assert labelled['ambiguous'].to_list() == [False, True, False, True] + assert labelled['exclude'].to_list() == [False, True, False, True] + assert labelled['match_status'].to_list() == ["matched", "excluded", "matched", "not_best"] def test_ambiguous_label(small_flipped_scorefile, small_target): """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - - no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() - - assert no_ambiguous['best_match'].to_list() == [True] - assert no_ambiguous['ambiguous'].to_list() == [True] - assert no_ambiguous['exclude'].to_list() == [True] - assert no_ambiguous['match_status'].to_list() == ["excluded"] + no_flip = {'skip_flip': True, 'remove_ambiguous': True, 'remove_multiallelic': False, 'keep_first_match': False} + no_ambiguous: pl.DataFrame = (get_all_matches(scorefile=scorefile, target=target) + .pipe(label_matches, params=no_flip) + .collect()) + + # 2:2:T:A -> refalt -> ambiguous -> excluded (best match but ambiguous) + # 1:1:A:C -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip) + # 2:2:T:A -> refalt_flip -> ambiguous -> not_best (refalt priority so not best and excluded) + # 3:3:T:G -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip) + assert no_ambiguous['best_match'].to_list() == [True, True, False, True] + assert no_ambiguous['ambiguous'].to_list() == [True, False, True, False] + assert no_ambiguous['exclude'].to_list() == [True, True, True, True] + assert no_ambiguous['match_status'].to_list() == ["excluded", "excluded", "not_best", "excluded"] # otherwise, ambiguous variants are kept - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect() - - assert labelled['best_match'].to_list() == [True] - assert labelled['ambiguous'].to_list() == [True] - assert labelled['exclude'].to_list() == [False] - assert labelled['match_status'].to_list() == ["matched"] - - -def test_duplicate_best_match(duplicated_matches, request): + flip_params = {'skip_flip': True, 'remove_ambiguous': False, 'remove_multiallelic': False, + 'keep_first_match': False} + labelled = (get_all_matches(scorefile=scorefile, target=target) + .pipe(label_matches, params=flip_params) + .collect()) + + # 2:2:T:A -> 
+    # 1:1:A:C -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip)
+    # 2:2:T:A -> refalt_flip -> ambiguous -> not_best (refalt priority so not best and excluded)
+    # 3:3:T:G -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip)
+    assert labelled['best_match'].to_list() == [True, True, False, True]
+    assert labelled['ambiguous'].to_list() == [True, False, True, False]
+    assert labelled['exclude'].to_list() == [False, True, True, True]
+    assert labelled['match_status'].to_list() == ["matched", "excluded", "not_best", "excluded"]
+
+
+def test_duplicate_ID(duplicated_matches, request):
     # these matches come from different lines in the original scoring file
     assert duplicated_matches["row_nr"].to_list() == [1, 4]
     # but they have the same ID!
@@ -94,7 +112,7 @@ def test_duplicate_best_match(duplicate_best_match):


 @pytest.fixture(params=[True, False], ids=["keep_first_match", "delete_both"])
-def duplicated_matches(small_scorefile, small_target, request):
+def duplicated_matches(small_scorefile, small_target, request) -> pl.DataFrame:
     # pgs catalog scorefiles can contain the same variant remapped to multiple rows
     # this happens after liftover to a different genome build
     # row_nrs will be different, but other information may be the same
@@ -105,21 +123,33 @@ def duplicated_matches(small_scorefile, small_target, request):
     scorefile, target = _cast_cat(dups, small_target)

-    return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param).collect()
+    params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False,
+              'keep_first_match': request.param}
+    return (get_all_matches(scorefile=scorefile, target=target)
+            .pipe(label_matches, params=params)
+            .collect())


 @pytest.fixture
-def multiple_match_types(small_target, small_scorefile):
+def multiple_match_types(small_target, small_scorefile) -> pl.DataFrame:
     # skip flip will return two candidate matches for one target position: refalt + refalt_flip
     scorefile, target = _cast_cat(small_scorefile, small_target)
-    return (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('chr_name') == '2')).collect()
+
+    params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False}
+    return (get_all_matches(scorefile=scorefile, target=target)
+            .pipe(label_matches, params=params)
+            .filter(pl.col('chr_name') == '2')
+            .collect())


 @pytest.fixture
-def duplicate_best_match(small_target, small_scorefile_no_oa):
+def duplicate_best_match(small_target, small_scorefile_no_oa) -> pl.DataFrame:
     # this type of target genome can sometimes occur when the REF is different at the same position
     odd_target = {'#CHROM': [1, 1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'],
                   'ID': ['1:1:T:C', '1:1:A:A'], 'is_multiallelic': [False, False]}
     scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target))
-    return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False).collect()
+
+    params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False}
+    return (get_all_matches(scorefile=scorefile, target=target)
+            .pipe(label_matches, params=params)
+            .collect())
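The expected labels in the test_label.py changes above all hinge on strand ambiguity: a variant like 2:2:T:A is palindromic (its two alleles are complementary bases), so a strand flip cannot be told apart from an allele swap, whereas 1:1:A:C and 3:3:T:G are unambiguous. A tiny, hypothetical check of that property, for illustration only and not the labelling logic inside label_matches:

    COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def is_strand_ambiguous(effect_allele: str, other_allele: str) -> bool:
        # A biallelic SNP is ambiguous when its alleles complement each other (A/T or C/G):
        # flipping strand reproduces the same allele pair, so orientation is unresolvable.
        return COMPLEMENT[effect_allele] == other_allele

    print(is_strand_ambiguous('T', 'A'))  # True  -> why 2:2:T:A is labelled ambiguous
    print(is_strand_ambiguous('A', 'C'))  # False -> 1:1:A:C
    print(is_strand_ambiguous('T', 'G'))  # False -> 3:3:T:G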
diff --git a/tests/match/test_match.py b/tests/match/test_match.py
index b8fbb07..ca509d6 100644
--- a/tests/match/test_match.py
+++ b/tests/match/test_match.py
@@ -5,6 +5,7 @@
 import polars as pl
 import pytest

+from pgscatalog_utils.match.label import label_matches
 from pgscatalog_utils.match.match import get_all_matches
 from pgscatalog_utils.match.match_variants import match_variants
@@ -60,15 +61,21 @@ def _cast_cat(scorefile, target) -> tuple[pl.LazyFrame, pl.LazyFrame]:

 def test_match_strategies(small_scorefile, small_target):
     scorefile, target = _cast_cat(small_scorefile, small_target)

+    params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
     # check unambiguous matches
-    df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False)
-          .filter(pl.col('ambiguous') == False)).collect()
+    df: pl.DataFrame = (get_all_matches(scorefile, target)
+                        .pipe(label_matches, params=params)
+                        .filter(pl.col('ambiguous') == False)
+                        .collect())
     assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'})
     assert set(df['match_type'].to_list()).issubset(['altref', 'refalt'])

     # when keeping ambiguous and flipping alleles
-    flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('ambiguous') == True)).collect()
+    flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
+    flip: pl.DataFrame = (get_all_matches(scorefile, target)
+                          .pipe(label_matches, params=flip_params)
+                          .filter(pl.col('ambiguous') == True)
+                          .collect())
     assert set(flip['ID'].to_list()).issubset({'2:2:T:A'})
     assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'})
@@ -77,28 +84,42 @@ def test_no_oa_match(small_scorefile_no_oa, small_target):
     scorefile, target = _cast_cat(small_scorefile_no_oa, small_target)

-    df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False)
-          .filter(pl.col('ambiguous') == False)).collect()
+    no_ambig = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
+    df: pl.DataFrame = (get_all_matches(scorefile, target)
+                        .pipe(label_matches, params=no_ambig)
+                        .filter(pl.col('ambiguous') == False)
+                        .collect())
     assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C'])
     assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref'])

     # check ambiguous matches
-    flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('ambiguous') == True)).collect()
+    ambig = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
+    flip: pl.DataFrame = (get_all_matches(scorefile, target)
+                          .pipe(label_matches, ambig)
+                          .filter(pl.col('ambiguous') == True)
+                          .collect())
     assert set(flip['ID'].to_list()).issubset({'2:2:T:A'})
     assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'})


 def test_flip_match(small_flipped_scorefile, small_target):
     scorefile, target = _cast_cat(small_flipped_scorefile, small_target)
-
-    df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect()
-    assert set(df['ambiguous']) == {True}
-    assert set(df['match_type']) == {'refalt'}
-
-    flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('ambiguous') == False)).collect()
+    params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False,
+              'remove_multiallelic': False}
+    df: pl.DataFrame = (get_all_matches(scorefile, target)
+                        .pipe(label_matches, params=params)
+                        .collect())
+
+    assert df['ambiguous'].to_list() == [True, False, True, False]
+    assert df['match_type'].to_list() == ['refalt', 'refalt_flip', 'altref_flip', 'altref_flip']
+    assert df['match_status'].to_list() == ['matched', 'excluded', 'not_best', 'excluded']  # flipped -> excluded
+
+    no_flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False,
+                      'remove_multiallelic': False}
+    flip: pl.DataFrame = (get_all_matches(scorefile, target)
+                          .pipe(label_matches, params=no_flip_params)
+                          .filter(pl.col('ambiguous') == False)
+                          .collect())

     assert flip['match_type'].str.contains('flip').all()
     assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C'])

From d5cfcf0fae63e34dd2fa799ab678fb5c2535be6b Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Mon, 10 Oct 2022 16:34:08 +0100
Subject: [PATCH 43/46] add sort by match type

---
 pgscatalog_utils/match/log.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py
index 6143308..978049c 100644
--- a/pgscatalog_utils/match/log.py
+++ b/pgscatalog_utils/match/log.py
@@ -51,7 +51,7 @@ def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
                  "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"]
     pretty_df = (df.select(keep_cols)
                  .select(pl.exclude("^.*_right"))
-                 .sort(["accession", "row_nr", "chr_name", "chr_position"]))
+                 .sort(["accession", "row_nr", "chr_name", "chr_position", "match_type"]))
     return pretty_df

From 126f153ff2b9fa14e4248482460988721d0a7537 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Mon, 10 Oct 2022 16:34:17 +0100
Subject: [PATCH 44/46] fix _match_multiple_targets

---
 pgscatalog_utils/match/match_variants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py
index 1fc322d..380e71c 100644
--- a/pgscatalog_utils/match/match_variants.py
+++ b/pgscatalog_utils/match/match_variants.py
@@ -110,7 +110,7 @@ def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace,
         _check_target_chroms(target)
         matches.append(get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory))
     return (pl.concat(matches)
-            .pipe(label_params=params))
+            .pipe(label_matches, params=params))


 def _description_text() -> str:

From 6aee17056bc93d0460ecc2a129e5223ca474a24e Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Mon, 10 Oct 2022 16:38:06 +0100
Subject: [PATCH 45/46] oops

---
 pgscatalog_utils/match/log.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py
index 978049c..5b74517 100644
--- a/pgscatalog_utils/match/log.py
+++ b/pgscatalog_utils/match/log.py
@@ -51,7 +51,7 @@ def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
                  "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"]
     pretty_df = (df.select(keep_cols)
                  .select(pl.exclude("^.*_right"))
-                 .sort(["accession", "row_nr", "chr_name", "chr_position", "match_type"]))
+                 .sort(["accession", "row_nr", "chr_name", "chr_position", "match_status"]))
     return pretty_df
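The patches above settle on one composition pattern: get_all_matches() now returns unlabelled match candidates, and label_matches() is applied afterwards through polars' .pipe(), which calls the supplied function with the frame as its first argument and forwards any keyword arguments. That is also why the earlier .pipe(label_params=params) call needed the fix in PATCH 44: .pipe() expects the callable first. A small self-contained sketch of the pattern, with a toy frame and a toy labelling function standing in for the package's own:

    import polars as pl

    def toy_label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
        # Toy labelling step (not the real label_matches): flag ambiguous rows for exclusion.
        return df.with_columns([
            (pl.col("ambiguous") & pl.lit(params["remove_ambiguous"])).alias("exclude")
        ])

    params = {"skip_flip": True, "remove_ambiguous": True,
              "remove_multiallelic": False, "keep_first_match": False}

    matches = pl.DataFrame({"ID": ["2:2:T:A", "1:1:A:C"], "ambiguous": [True, False]}).lazy()

    # .pipe(fn, **kwargs) is equivalent to fn(matches, **kwargs): the callable comes first,
    # which is why a call passing only keyword arguments could not work.
    labelled = matches.pipe(toy_label_matches, params=params).collect()
    print(labelled)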
From 4a213936e4c9c4ed91a54e2b19584f1d9f0f967b Mon Sep 17 00:00:00 2001
From: Sam Lambert
Date: Tue, 11 Oct 2022 11:34:31 +0100
Subject: [PATCH 46/46] Update pyproject.toml

Add Laurent

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ef69c8f..18de317 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@
 name = "pgscatalog_utils"
 version = "0.2.0"
 description = "Utilities for working with PGS Catalog API and scoring files"
 homepage = "https://github.com/PGScatalog/pgscatalog_utils"
-authors = ["Benjamin Wingfield ", "Samuel Lambert "]
+authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "]
 license = "Apache-2.0"
 readme = "README.md"