From 77098befc3a1d692292fbbffb0ff977357c116b4 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 2 Sep 2022 13:16:03 +0100 Subject: [PATCH 01/46] Fist commit of the 'validate' utils --- pgscatalog_utils/validate/__init__.py | 0 pgscatalog_utils/validate/common_constants.py | 44 +++ .../validate/formatted/__init__.py | 0 .../validate/formatted/validator.py | 230 +++++++++++ .../validate/harmonized_position/__init__.py | 0 .../validate/harmonized_position/validator.py | 137 +++++++ pgscatalog_utils/validate/helpers.py | 29 ++ pgscatalog_utils/validate/schemas.py | 158 ++++++++ .../validate/validate_scorefile.py | 203 ++++++++++ pgscatalog_utils/validate/validator_base.py | 364 ++++++++++++++++++ 10 files changed, 1165 insertions(+) create mode 100644 pgscatalog_utils/validate/__init__.py create mode 100644 pgscatalog_utils/validate/common_constants.py create mode 100644 pgscatalog_utils/validate/formatted/__init__.py create mode 100644 pgscatalog_utils/validate/formatted/validator.py create mode 100644 pgscatalog_utils/validate/harmonized_position/__init__.py create mode 100644 pgscatalog_utils/validate/harmonized_position/validator.py create mode 100644 pgscatalog_utils/validate/helpers.py create mode 100644 pgscatalog_utils/validate/schemas.py create mode 100644 pgscatalog_utils/validate/validate_scorefile.py create mode 100644 pgscatalog_utils/validate/validator_base.py diff --git a/pgscatalog_utils/validate/__init__.py b/pgscatalog_utils/validate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/validate/common_constants.py b/pgscatalog_utils/validate/common_constants.py new file mode 100644 index 0000000..768752a --- /dev/null +++ b/pgscatalog_utils/validate/common_constants.py @@ -0,0 +1,44 @@ +SNP_DSET = 'rsID' +CHR_DSET = 'chr_name' +BP_DSET = 'chr_position' +EFFECT_DSET = 'effect_allele' +OTH_DSET = 'other_allele' +EFFECT_WEIGHT_DSET = 'effect_weight' + +# Other columns +LOCUS_DSET = 'locus_name' +OR_DSET = 'OR' +HR_DSET = 'HR' +BETA_DSET = 'beta' +FREQ_DSET = 'allelefrequency_effect' +FLAG_INTERACTION_DSET = 'is_interaction' +FLAG_RECESSIVE_DSET = 'is_recessive' +FLAG_HAPLOTYPE_DSET = 'is_haplotype' +FLAG_DIPLOTYPE_DSET = 'is_diplotype' +METHOD_DSET = 'imputation_method' +SNP_DESC_DSET = 'variant_description' +INCLUSION_DSET = 'inclusion_criteria' +DOSAGE_0_WEIGHT = 'dosage_0_weight' +DOSAGE_1_WEIGHT = 'dosage_1_weight' +DOSAGE_2_WEIGHT = 'dosage_2_weight' +# hmPOS +HM_SOURCE_DSET = 'hm_source' +HM_SNP_DSET = 'hm_rsID' +HM_CHR_DSET = 'hm_chr' +HM_BP_DSET = 'hm_pos' +HM_OTH_DSET = 'hm_inferOtherAllele' +HM_MATCH_CHR_DSET = 'hm_match_chr' +HM_MATCH_BP_DSET = 'hm_match_pos' +# hmFinal +VARIANT_DSET = 'variant_id' +HM_CODE_DSET = 'hm_code' +HM_INFO_DSET = 'hm_info' + + +DSET_TYPES = {SNP_DSET: str, CHR_DSET: str, BP_DSET: int, EFFECT_DSET: str, OTH_DSET: str, + EFFECT_WEIGHT_DSET: float, VARIANT_DSET: str, HM_CODE_DSET: int, HM_INFO_DSET: str, LOCUS_DSET: str, OR_DSET: float, HR_DSET: float, BETA_DSET: float, FREQ_DSET: float, + FLAG_INTERACTION_DSET: str, FLAG_RECESSIVE_DSET: str, FLAG_HAPLOTYPE_DSET: str, FLAG_DIPLOTYPE_DSET: str, + METHOD_DSET: str, SNP_DESC_DSET: str, INCLUSION_DSET: str, DOSAGE_0_WEIGHT: float, DOSAGE_1_WEIGHT: float, DOSAGE_2_WEIGHT: float, + HM_SOURCE_DSET:str, HM_SNP_DSET: str, HM_CHR_DSET: str, HM_BP_DSET: int, HM_OTH_DSET: str, HM_MATCH_CHR_DSET: str, HM_MATCH_BP_DSET: int} + +TO_DISPLAY_ORDER = [ SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, OR_DSET, HR_DSET, HM_CODE_DSET, 
HM_INFO_DSET, HM_SOURCE_DSET, HM_SNP_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET] \ No newline at end of file diff --git a/pgscatalog_utils/validate/formatted/__init__.py b/pgscatalog_utils/validate/formatted/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py new file mode 100644 index 0000000..0a5ed3b --- /dev/null +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -0,0 +1,230 @@ +import gzip +import re +from pandas_schema import Schema +from pgscatalog_utils.validate.schemas import * +from pgscatalog_utils.validate.validator_base import * +# from schemas import * +# from validator_base import * + +''' +PGS Catalog Harmonized file validator +- using pandas_schema https://github.com/TMiguelT/PandasSchema +''' + +class ValidatorFormatted(ValidatorBase): + + def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): + super().__init__(file, score_dir, logfile, error_limit) + self.score_dir=None + self.meta_format = FORMATTED_META_GENERIC + self.validators = FORMATTED_VALIDATORS + self.valid_cols = VALID_COLS_FORMATTED + self.valid_type = VALID_TYPE_FORMATTED + + + def extract_specific_metadata(self,line): + ''' Extract some of the metadata. ''' + match_variants_number = re.search(r'#variants_number=(\d+)', line) + if match_variants_number: + self.variants_number = int(match_variants_number.group(1)) + + + def get_and_check_variants_number(self): + ''' Verify that the number of variant lines corresponds to the number of variants in the headers ''' + variant_lines = 0 + + with gzip.open( self.file, 'rb') as f: + line_number = 0 + for line in f: + line_number += 1 + line = line.decode('utf-8').rstrip() + if line.startswith('#'): + match_variants_number = re.search(r'#variants_number=(\d+)', line) + if match_variants_number: + self.variants_number = int(match_variants_number.group(1)) + else: + variant_lines += 1 + if re.search('\w+', line): # Line not empty + cols = line.split(self.sep) + has_trailing_spaces = self.check_leading_trailing_spaces(cols,line_number) + if has_trailing_spaces: + self.global_errors += 1 + else: + self.logger.error(f'- Line {line_number} is empty') + self.global_errors += 1 + + if self.variants_number: + variant_lines -= 1 # Remove the header line from the count + if self.variants_number != variant_lines: + self.logger.error(f'- The number of variants lines in the file ({variant_lines}) and the number of variants declared in the headers ({self.variants_number}) are different') + self.global_errors += 1 + else: + self.logger.error("- Can't retrieve the number of variants from the headers") + self.global_errors += 1 + + + def detect_duplicated_rows(self,dataframe_chunk): + ''' Detect duplicated rows in the scoring file. 
'''
+        # Columns of interest to compare the different rows
+        cols_sel = []
+        for col in ['rsID','chr_name','chr_position','effect_allele','other_allele']:
+            if col in self.cols_to_validate:
+                cols_sel.append(col)
+
+        duplicate_status = dataframe_chunk.duplicated(cols_sel)
+        if any(duplicate_status):
+            duplicated_rows = dataframe_chunk[duplicate_status]
+            self.logger.error(f'Duplicated row(s) found: {len(duplicated_rows.index)}\n\t-> {duplicated_rows.to_string(header=False,index=False)}')
+            self.global_errors += 1
+            for index in duplicated_rows.index:
+                self.bad_rows.append(index)
+
+
+    def validate_data(self):
+        if not self.open_file_and_check_for_squareness():
+            self.logger.error("Please fix the table. Some rows have different numbers of columns to the header")
+            self.logger.info("Rows with different numbers of columns to the header are not validated")
+        # Check the consistency between the declared number of variants and the actual number of variants in the file
+        self.get_and_check_variants_number()
+
+        for chunk in self.df_iterator(self.file):
+            to_validate = chunk[self.cols_to_read]
+            to_validate.columns = self.cols_to_validate # sets the headers to standard format if needed
+
+            # Detect duplicated rows
+            self.detect_duplicated_rows(to_validate)
+            # Validate the SNP column if present
+            if SNP_DSET in self.header:
+                if CHR_DSET in self.header and BP_DSET in self.header:
+                    self.schema = Schema([FORMATTED_VALIDATORS_SNP_EMPTY[h] for h in self.cols_to_validate])
+                else:
+                    self.schema = Schema([FORMATTED_VALIDATORS_SNP[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+
+            if CHR_DSET in self.header and BP_DSET in self.header:
+                self.schema = Schema([FORMATTED_VALIDATORS_POS[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+            if OR_DSET in self.header:
+                self.schema = Schema([FORMATTED_VALIDATORS_OR[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+            if HR_DSET in self.header:
+                self.schema = Schema([FORMATTED_VALIDATORS_HR[h] for h in self.cols_to_validate])
+                errors = self.schema.validate(to_validate)
+                self.store_errors(errors)
+            self.process_errors()
+            if len(self.bad_rows) >= self.error_limit:
+                break
+        if not self.bad_rows and not self.global_errors:
+            self.logger.info("File is valid")
+            return True
+
+        else:
+            self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit))
+            return False
+
+
+    def validate_filename(self):
+        filename = self.file.split('/')[-1].split('.')[0]
+        if re.match('^PGS\d{6}$', filename):
+            return True
+        else:
+            self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where the 'X' are the 6 digits of the PGS identifier (e.g.
PGS000001)".format(filename)) + return False + + + def validate_headers(self): + self.setup_field_validation() + self.detect_genomebuild_with_rsid() + required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header) + if not required_is_subset: + # check if everything but snp: + required_is_subset = set(CHR_COLS_VAR_FORMATTED).issubset(self.header) + if not required_is_subset: + required_is_subset = set(SNP_COLS_VAR_FORMATTED).issubset(self.header) + if not required_is_subset: + self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_FORMATTED, self.header)) + + # Check if at least one of the effect columns is there + has_effect_col = 0 + for col in STD_COLS_EFFECT_FORMATTED: + if set([col]).issubset(self.header): + has_effect_col = 1 + break + if not has_effect_col: + self.logger.error("Required headers: at least one of the columns '{}' must be in the file header: {}".format(STD_COLS_EFFECT_FORMATTED, self.header)) + required_is_subset = None + + return required_is_subset + + + def detect_genomebuild_with_rsid(self): + ''' The column "rsID" should always be in the scoring file when the genome build is not reported (i.e. "NR") ''' + self.get_genomebuild() + if self.genomebuild == 'NR': + if SNP_DSET not in self.header: + self.logger.error(f"- The combination: Genome Build = '{self.genomebuild}' & the missing column '{SNP_DSET}' in the header is not allowed as we have to manually guess the genome build.") + self.global_errors += 1 + + + def get_genomebuild(self): + ''' Retrieve the Genome Build from the comments ''' + with gzip.open(self.file, 'rb') as f_in: + for f_line in f_in: + line = f_line.decode() + # Update header + if line.startswith('#genome_build'): + gb = (line.split('='))[1] + self.genomebuild = gb.strip() + return + + +################################################################## + +def init_validator(file, logfile, score_dir=None) -> ValidatorFormatted: + validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) + return validator + +# def run_validator(file, check_filename, logfile, score_dir=None): + +# validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) + +# validator.logger.propagate = False + +# if not file or not logfile: +# validator.logger.info("Missing file and/or logfile") +# validator.logger.info("Exiting before any further checks") +# sys.exit() +# if not os.path.exists(file): +# validator.logger.info("Error: the file '"+file+"' can't be found") +# validator.logger.info("Exiting before any further checks") +# sys.exit() + +# is_ok_to_run_validation = 1 +# validator.logger.info("Validating file extension...") +# if not validator.validate_file_extension(): +# validator.logger.info("Invalid file extension: {}".format(file)) +# validator.logger.info("Exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation and check_filename: +# validator.logger.info("Validating file name...") +# if not validator.validate_filename(): +# validator.logger.info("Invalid filename: {}".format(file)) +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating headers...") +# if not validator.validate_headers(): +# validator.logger.info("Invalid headers...exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating data...") +# validator.validate_data() + +# # Close log handler +# validator.logger.removeHandler(validator.handler) +# 
validator.handler.close() \ No newline at end of file diff --git a/pgscatalog_utils/validate/harmonized_position/__init__.py b/pgscatalog_utils/validate/harmonized_position/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py new file mode 100644 index 0000000..c12ca58 --- /dev/null +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -0,0 +1,137 @@ +import re +from pgscatalog_utils.validate.schemas import * +from pgscatalog_utils.validate.validator_base import * + +''' +PGS Catalog Harmonized file validator +- using pandas_schema https://github.com/TMiguelT/PandasSchema +''' + +class ValidatorPos(ValidatorBase): + ''' Validator for the HmPOS Harmonized file format. ''' + + def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): + super().__init__(file, score_dir, logfile, error_limit) + self.meta_format = HM_META_POS + self.validators = POS_VALIDATORS + self.valid_cols = VALID_COLS_POS + self.valid_type = VALID_TYPE_POS + + + def extract_specific_metadata(self,line): + ''' Extract some of the metadata. ''' + match_variants_number = re.search(r'#variants_number=(\d+)', line) + if match_variants_number: + self.variants_number = int(match_variants_number.group(1)) + + + def validate_line_content(self,cols_content,var_line_number): + ''' Populate the abstract method from ValidatorBase, to check some data in esch row. ''' + # Check lines + line_dict = dict(zip(self.header, cols_content)) + line_cols = line_dict.keys() + # Check each chromosome data is consistent + chr_cols = ['chr_name', 'hm_chr', 'hm_match_chr'] + if all(col_name in line_cols for col_name in chr_cols): + if line_dict['chr_name'] == line_dict['hm_chr'] and line_dict['hm_match_chr'] != 'True': + self.logger.error(f"- Variant line {var_line_number} | 'hm_match_chr' should be 'True': same chromosome ('chr_name={line_dict['chr_name']}' vs 'hm_chr={line_dict['hm_chr']}')") + # Check each position data is consistent + pos_cols = ['chr_position', 'hm_pos', 'hm_match_pos'] + if all(col_name in line_cols for col_name in pos_cols): + if line_dict['chr_position'] == line_dict['hm_pos'] and line_dict['hm_match_pos'] != 'True': + self.logger.error(f"- Variant line {var_line_number} | 'hm_match_pos' should be 'True': same position ('chr_position={line_dict['chr_position']}' vs 'hm_pos={line_dict['hm_pos']}')") + + + def validate_filename(self): + ''' Validate the file name structure. ''' + pgs_id, build = None, None + # hmPOS + filename = self.file.split('/')[-1].split('.')[0] + filename_parts = filename.split('_hmPOS_') + if len(filename_parts) != 2: + self.logger.error("Filename: {} should follow the pattern _hmPOS_.txt.gz [build=GRChXX]".format(filename)) + return False + else: + pgs_id, build = filename_parts + self.file_pgs_id = pgs_id + self.file_genomebuild = build + if not self.check_build_is_legit(build): + self.logger.error("Build: {} is not an accepted build value".format(build)) + return False + self.logger.info("Filename looks good!") + return True + + + def validate_headers(self): + ''' Validate the list of column names. 
''' + # Check if it has at least a "SNP" column or a "chromosome" column + self.setup_field_validation() + required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header) + if not required_is_subset: + self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_POS, self.header)) + + # Check if it has at least a "SNP" column or a "chromosome" column + required_pos = set(SNP_COLS_VAR_POS).issubset(self.header) + if not required_pos: + # check if everything but snp: + required_pos = set(CHR_COLS_VAR_POS).issubset(self.header) + if not required_pos: + self.logger.error("One of the following required header is missing: '{}' and/or '{}' are not in the file header: {}".format(SNP_COLS_VAR_POS, CHR_COLS_VAR_POS, self.header)) + required_is_subset = required_pos + + return required_is_subset + + +################################################################## + +def init_validator(file, logfile, score_dir=None) -> ValidatorPos: + validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) + return validator + +# def run_validator(file, check_filename, logfile, score_dir=None): + +# validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) + +# validator.logger.propagate = False + +# if not file or not logfile: +# validator.logger.info("Missing file and/or logfile") +# validator.logger.info("Exiting before any further checks") +# sys.exit() +# if not os.path.exists(file): +# validator.logger.info("Error: the file '"+file+"' can't be found") +# validator.logger.info("Exiting before any further checks") +# sys.exit() + +# is_ok_to_run_validation = 1 +# validator.logger.info("Validating file extension...") +# if not validator.validate_file_extension(): +# validator.logger.info("Invalid file extension: {}".format(file)) +# validator.logger.info("Exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation and check_filename: +# validator.logger.info("Validating file name...") +# if not validator.validate_filename(): +# validator.logger.info("Invalid filename: {}".format(file)) +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Comparing filename with metadata...") +# if not validator.compare_with_filename(): +# validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating headers...") +# if not validator.validate_headers(): +# validator.logger.info("Invalid headers...exiting before any further checks") +# is_ok_to_run_validation = 0 + +# if is_ok_to_run_validation: +# validator.logger.info("Validating data...") +# validator.validate_data() + +# # Close log handler +# validator.logger.removeHandler(validator.handler) +# validator.handler.close() \ No newline at end of file diff --git a/pgscatalog_utils/validate/helpers.py b/pgscatalog_utils/validate/helpers.py new file mode 100644 index 0000000..7d786e5 --- /dev/null +++ b/pgscatalog_utils/validate/helpers.py @@ -0,0 +1,29 @@ +import math +import pandas as pd +from pandas_schema.validation import _SeriesValidation + + +class InInclusiveRangeValidation(_SeriesValidation): + """ + Checks that each element in the series is within a given inclusive numerical range. + Doesn't care if the values are not numeric - it will try anyway. 
+ """ + def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): + """ + :param min: The minimum (inclusive) value to accept + :param max: The maximum (inclusive) value to accept + """ + self.min = min + self.max = max + super().__init__(**kwargs) + + @property + def default_message(self): + return 'was not in the range [{}, {})'.format(self.min, self.max) + + def validate(self, series: pd.Series) -> pd.Series: + series = pd.to_numeric(series, errors='coerce') + return (series >= self.min) & (series <= self.max) + + + diff --git a/pgscatalog_utils/validate/schemas.py b/pgscatalog_utils/validate/schemas.py new file mode 100644 index 0000000..7487b21 --- /dev/null +++ b/pgscatalog_utils/validate/schemas.py @@ -0,0 +1,158 @@ +import sys +import numpy as np +from pandas_schema import Column +from pandas_schema.validation import MatchesPatternValidation, InListValidation, CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation, CustomElementValidation +from pgscatalog_utils.validate.helpers import InInclusiveRangeValidation +from pgscatalog_utils.validate.common_constants import * + + +#### Validation types #### + +VALID_TYPE_FORMATTED = 'formatted' +VALID_TYPE_POS = 'hm_pos' + + +#### Columns #### + +# Formatted scoring files +STD_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET, SNP_DSET) #OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, SE_DSET, FREQ_DSET , EFFECT_DSET, OTH_DSET) + +SNP_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET) +CHR_COLS_VAR_FORMATTED = (EFFECT_DSET, SNP_DSET) + +STD_COLS_EFFECT_FORMATTED = (EFFECT_WEIGHT_DSET,OR_DSET,HR_DSET) + +VALID_COLS_FORMATTED = (EFFECT_WEIGHT_DSET, OR_DSET, HR_DSET, BETA_DSET, FREQ_DSET, LOCUS_DSET, EFFECT_DSET, OTH_DSET, CHR_DSET, BP_DSET, SNP_DSET) + +# Harmonized scoring files - POS +STD_COLS_VAR_POS = (HM_SOURCE_DSET, HM_CHR_DSET, HM_BP_DSET) + +SNP_COLS_VAR_POS = (SNP_DSET, HM_SNP_DSET) +CHR_COLS_VAR_POS = (CHR_DSET,) + +VALID_COLS_POS = (HM_SOURCE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET) + +# Harmonized scoring files - Final +STD_COLS_VAR_FINAL = (EFFECT_DSET, EFFECT_WEIGHT_DSET, HM_CODE_DSET, HM_INFO_DSET) + +SNP_COLS_VAR_FINAL = (VARIANT_DSET,) +CHR_COLS_VAR_FINAL = (CHR_DSET, HM_CHR_DSET) + +VALID_COLS_FINAL = (SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, HM_CODE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET) + + +#### Global variables #### + +VALID_CHROMOSOMES = ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '11', '12', '13', '14', '15', '16', + '17', '18', '19', '20', '21', '22', + 'X', 'x', 'Y', 'y', 'XY', 'xy', 'MT', 'Mt', 'mt'] + +VALID_FILE_EXTENSIONS = [".txt", ".txt.gz"] + +# For the harmonized files +VALID_SOURCES = ['ENSEMBL','Author-reported'] +# VALID_CODES = ['5','4','3','1','0','-1','-4','-5'] +BUILD_LIST = ['GRCh37','GRCh38'] + + +error_msg = 'this column cannot be null/empty' +null_validation = CustomElementValidation(lambda d: d is not np.nan and d != '', error_msg) + + +#### Validators #### + +# Generic/shared validators +GENERIC_VALIDATORS = { + CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True), + BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True), + EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET]), null_validation], allow_empty=False), + EFFECT_DSET: 
Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=False), + OTH_DSET: Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True), + LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], allow_empty=True) +} + +# Formatted validators +FORMATTED_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +FORMATTED_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=True) +FORMATTED_VALIDATORS[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[BETA_DSET] = Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[FREQ_DSET] = Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_0_WEIGHT] = Column(DOSAGE_0_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_0_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_1_WEIGHT] = Column(DOSAGE_1_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_1_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_2_WEIGHT] = Column(DOSAGE_2_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_2_WEIGHT]), null_validation], allow_empty=True) + +FORMATTED_VALIDATORS_SNP = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=False) + +FORMATTED_VALIDATORS_SNP_EMPTY = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP_EMPTY[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs[0-9]+|HLA\-\w+\*[0-9]+|nan)$')], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_POS = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_POS[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_POS[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_OR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_OR[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=False) + +FORMATTED_VALIDATORS_HR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_HR[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=False) + +# Position validators +POS_VALIDATORS = {} +POS_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +POS_VALIDATORS[HM_SOURCE_DSET] = Column(HM_SOURCE_DSET, [CanConvertValidation(DSET_TYPES[HM_SOURCE_DSET]), InListValidation(VALID_SOURCES), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], 
allow_empty=False) +POS_VALIDATORS[HM_SNP_DSET] = Column(HM_SNP_DSET, [CanConvertValidation(DSET_TYPES[HM_SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=True) +POS_VALIDATORS[HM_CHR_DSET] = Column(HM_CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True) +POS_VALIDATORS[HM_BP_DSET] = Column(HM_BP_DSET, [CanConvertValidation(DSET_TYPES[HM_BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True) +POS_VALIDATORS[HM_OTH_DSET] = Column(HM_OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\/]+$')], allow_empty=True) +POS_VALIDATORS[HM_MATCH_CHR_DSET] = Column(HM_MATCH_CHR_DSET, [InListValidation(['True', 'False'])], allow_empty=True) +POS_VALIDATORS[HM_MATCH_BP_DSET] = Column(HM_MATCH_BP_DSET, [InListValidation(['True', 'False'])], allow_empty=True) + +# Final validator +# FINAL_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +# FINAL_VALIDATORS[EFFECT_DSET] = Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True) +# FINAL_VALIDATORS[OTH_DSET] = Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\.]+$')], allow_empty=True) +# FINAL_VALIDATORS[VARIANT_DSET] = Column(VARIANT_DSET, [CanConvertValidation(DSET_TYPES[VARIANT_DSET]), MatchesPatternValidation(r'^((rs|HLA\-\w+\*)[0-9]+|\.)$')], allow_empty=True) +# FINAL_VALIDATORS[HM_CODE_DSET] = Column(HM_CODE_DSET, [InListValidation(VALID_CODES), null_validation], allow_empty=True) +# FINAL_VALIDATORS[HM_INFO_DSET] = Column(HM_INFO_DSET, [CanConvertValidation(DSET_TYPES[HM_INFO_DSET]), null_validation], allow_empty=True) + + +#### Metadata entries #### + +FORMATTED_META_GENERIC = [ + '###PGS CATALOG SCORING FILE', + '#format_version', + '##POLYGENIC SCORE', + '#pgs_id', + '#pgs_name', + '#trait_reported', + '#trait_mapped', + '#trait_efo', + '#genome_build', + '#variants_number', + '#weight_type', + '##SOURCE INFORMATION', + '#pgp_id', + '#citation' +] + +HM_META_GENERIC = [ x for x in FORMATTED_META_GENERIC ] +HM_META_GENERIC.append('##HARMONIZATION DETAILS') + +HM_META_POS = [ x for x in HM_META_GENERIC ] +HM_META_POS.append('#HmPOS_build') +HM_META_POS.append('#HmPOS_date') +HM_META_POS.append('#HmPOS_match_chr') +HM_META_POS.append('#HmPOS_match_pos') + +# HM_META_FINAL = [ x for x in HM_META_GENERIC ] +# HM_META_FINAL.append('#Hm_file_version') +# HM_META_FINAL.append('#Hm_genome_build') +# HM_META_FINAL.append('#Hm_reference_source') +# HM_META_FINAL.append('#Hm_creation_date') +# HM_META_FINAL.append('#Hm_variants_number_matched') +# HM_META_FINAL.append('#Hm_variants_number_unmapped') \ No newline at end of file diff --git a/pgscatalog_utils/validate/validate_scorefile.py b/pgscatalog_utils/validate/validate_scorefile.py new file mode 100644 index 0000000..f31ef88 --- /dev/null +++ b/pgscatalog_utils/validate/validate_scorefile.py @@ -0,0 +1,203 @@ +import os, glob, re +import argparse +import logging + +data_sum = {'valid': [], 'invalid': [], 'other': []} + +val_types = ('formatted', 'hm_pos') + +logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + +def _read_last_line(file: str) -> str: + ''' + Return the last line of the file + ''' + fileHandle = open ( file,"r" ) + lineList = fileHandle.readlines() + fileHandle.close() + return lineList[-1] + + +def _file_validation_state(filename: str, log_file: str) -> None: + global data_sum + if os.path.exists(log_file): + log_result = _read_last_line(log_file) + if re.search("File is valid", log_result): + print("> valid\n") + data_sum['valid'].append(filename) + elif 
re.search("File is invalid", log_result): + print("#### invalid! ####\n") + data_sum['invalid'].append(filename) + else:# + print("!! validation process had an issue. Please look at the logs.\n") + data_sum['other'].append(filename) + else: + print("!! validation process had an issue: the log file can't be found") + data_sum['other'].append(filename) + + +def _run_validator(validator: object, file: str, check_filename: bool, logfile: str, validator_type: str) -> None: + ''' Main method to run the PGS file validator ''' + validator.logger.propagate = False + + is_ok_to_continue_validation = 1 + + # Check files exist + if not file or not logfile: + validator.logger.info("Missing file and/or logfile") + is_ok_to_continue_validation = 0 + elif file and not os.path.exists(file): + validator.logger.info("Error: the file '"+file+"' can't be found") + is_ok_to_continue_validation = 0 + + # Validate file extension + validator.logger.info("Validating file extension...") + if not validator.validate_file_extension(): + validator.logger.info("Invalid file extension: {}".format(file)) + is_ok_to_continue_validation = 0 + # Validate file name nomenclature + if is_ok_to_continue_validation and check_filename: + validator.logger.info("Validating file name...") + if not validator.validate_filename(): + validator.logger.info("Invalid filename: {}".format(file)) + is_ok_to_continue_validation = 0 + + # Only for harmonized files + if is_ok_to_continue_validation and validator_type != 'formatted': + validator.logger.info("Comparing filename with metadata...") + if not validator.compare_with_filename(): + validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) + is_ok_to_continue_validation = 0 + + # Validate column headers + if is_ok_to_continue_validation: + validator.logger.info("Validating headers...") + if not validator.validate_headers(): + validator.logger.info("Invalid headers...exiting before any further checks") + is_ok_to_continue_validation = 0 + + # Validate data content + if is_ok_to_continue_validation: + validator.logger.info("Validating data...") + validator.validate_data() + + if is_ok_to_continue_validation == 0: + validator.logger.info("Exiting before any further checks") + + # Close log handler + validator.logger.removeHandler(validator.handler) + validator.handler.close() + + +def _check_args(args): + global score_dir + + ## Check parameters ## + # Type of validator + if args.t not in val_types: + print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.") + exit(1) + # Logs dir + if not os.path.isdir(args.log_dir): + print(f"Error: Log dir '{args.log_dir}' can't be found!") + exit(1) + # File and directory parameters (only one of the '-f' and '--dir' can be used) + if args.f and args.dir: + print("Error: you can't use both options [-f] - single scoring file and [--dir] - directory of scoring files. 
Please use only one of these two options!")
+        exit(1)
+    elif not args.f and not args.dir:
+        print("Error: you need to provide a scoring file [-f] or a directory of scoring files [--dir]!")
+        exit(1)
+    elif args.f and not os.path.isfile(args.f):
+        print(f"Error: Scoring file '{args.f}' can't be found!")
+        exit(1)
+    elif args.dir and not os.path.isdir(args.dir):
+        print(f"Error: the scoring file directory '{args.dir}' can't be found!")
+        exit(1)
+    # Scoring files directory (only to compare with the harmonized files)
+    score_dir = None
+    if args.score_dir:
+        score_dir = args.score_dir
+        if not os.path.isdir(score_dir):
+            print(f"Error: Scoring file directory '{score_dir}' can't be found!")
+            exit(1)
+    elif args.t != 'formatted':
+        print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.")
+
+
+def validate_file(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None:
+    ''' Run the file validator '''
+    file = os.path.basename(filepath)
+    filename = file.split('.')[0]
+    print(f"# Filename: {file}")
+    log_file = log_dir+'/'+filename+'_log.txt'
+
+    # Run validator
+    validator = validator_package.init_validator(filepath,log_file,score_dir)
+    _run_validator(validator,filepath,check_filename,log_file,validator_type)
+
+    # Check log
+    _file_validation_state(file,log_file)
+
+
+def main():
+    global data_sum, score_dir
+
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE')
+    argparser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME')
+    argparser.add_argument('--dir', help='The name of the directory containing the files that need to be processed (no need to use the [-f] option)')
+    argparser.add_argument('--score_dir', help='The name of the directory containing the formatted scoring files to compare with harmonized scoring files')
+    argparser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True)
+    argparser.add_argument('--check_filename', help='Check that the file name matches the PGS Catalog nomenclature', required=False, action='store_true')
+
+    args = argparser.parse_args()
+
+    ## Check parameters ##
+    _check_args(args)
+
+    # Check PGS Catalog file name nomenclature
+    check_filename = False
+    if args.check_filename:
+        check_filename = True
+    else:
+        print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.")
+
+    validator_type = args.t
+    files_dir = args.dir
+
+    log_dir = args.log_dir
+
+    ## Select validator class ##
+    if validator_type == 'formatted':
+        import pgscatalog_utils.validate.formatted.validator as validator_package
+    elif validator_type == 'hm_pos':
+        import pgscatalog_utils.validate.harmonized_position.validator as validator_package
+
+    ## Run validator ##
+    # One file
+    if args.f:
+        validate_file(args.f,log_dir,score_dir,validator_package,check_filename,validator_type)
+    # Content of the directory
+    elif files_dir:
+        count_files = 0
+        # Browse directory: for each file run validator
+        for filepath in sorted(glob.glob(files_dir+"/*.*")):
+
validate_file(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) + count_files += 1 + + # Print summary + results + print("\nSummary:") + if data_sum['valid']: + print(f"- Valid: {len(data_sum['valid'])}/{count_files}") + if data_sum['invalid']: + print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") + if data_sum['other']: + print(f"- Other issues: {len(data_sum['other'])}/{count_files}") + + if data_sum['invalid']: + print("Invalid files:") + print("\n".join(data_sum['invalid'])) + +if __name__ == '__main__': + main() diff --git a/pgscatalog_utils/validate/validator_base.py b/pgscatalog_utils/validate/validator_base.py new file mode 100644 index 0000000..f76adf1 --- /dev/null +++ b/pgscatalog_utils/validate/validator_base.py @@ -0,0 +1,364 @@ +import os, sys, gc +import gzip +import csv +import pathlib +import logging +import re +from typing import List +import pandas as pd +import pandas_schema +from pgscatalog_utils.validate.schemas import * +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='pandas_schema') + +''' +PGS Catalog file validator +- using pandas_schema https://github.com/TMiguelT/PandasSchema +''' + + +csv.field_size_limit(sys.maxsize) + +class ValidatorBase: + + valid_extensions = VALID_FILE_EXTENSIONS + validators = GENERIC_VALIDATORS + valid_cols = [] + valid_type = '' + sep = '\t' + + def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): + self.file = file + self.score_dir = score_dir + self.schema = None + self.header = [] + self.genomebuild = None + self.comment_lines_count = 1 # Counting the header line + self.cols_to_validate = [] + self.cols_to_read = [] + self.bad_rows = [] + self.row_errors = [] + self.errors_seen = {} + self.logfile = logfile + self.error_limit = int(error_limit) + + # Logging variables + self.logger = logging.getLogger(__name__) + self.handler = logging.FileHandler(self.logfile, 'w+') + self.handler.setLevel(logging.INFO) + self.logger.addHandler(self.handler) + self.logger.propagate = False + + self.global_errors = 0 + self.variants_number = 0 + + + def setup_field_validation(self): + ''' + Fetch the header and build the list of column to check/validate + ''' + self.header = self.get_header() + self.cols_to_validate = [h for h in self.header if h in self.valid_cols] + self.cols_to_read = [h for h in self.header if h in self.valid_cols] + + + def get_header(self): + ''' + Fetch the header (i.e. 
column names) information from the harmonized scoring file and store the list in a variable + ''' + first_row = pd.read_csv(self.file, sep=self.sep, comment='#', nrows=1, index_col=False) + # Check if the column headers have leading and/or trailing spaces + # The leading/trailing spaces should raise an error during the header validation + has_trailing_spaces = self.check_leading_trailing_spaces(first_row.columns.values) + if has_trailing_spaces: + self.global_errors += 1 + return first_row.columns.values + + + def get_genomebuild(self): + ''' Retrieve the Genome Build from the comments ''' + if self.valid_type == 'hm_pos': + self.genomebuild = self.get_comments_info('#HmPOS_build') + else: + self.genomebuild = self.get_comments_info('#Hm_genome_build') + + + def get_pgs_id(self): + ''' Retrieve the PGS ID from the comments ''' + self.pgs_id = self.get_comments_info('#pgs_id') + + + def validate_content(self): + ''' Validate the file content and verify that the number of variant lines corresponds to the number of variants in the headers ''' + variant_lines_count = 0 + meta_lines_count = 0 + + with gzip.open( self.file, 'rb') as f: + line_number = 0 + file_meta = [] + for line in f: + line_number += 1 + line = line.decode('utf-8').rstrip() + # Check Metadata + if line.startswith('#'): + self.extract_specific_metadata(line) + # Check that we have all the meta information + for meta in self.meta_format: + if line.startswith(meta): + file_meta.append(meta) + meta_lines_count += 1 + break + + # Check data + else: + variant_lines_count += 1 + if re.search('\w+', line): # Line not empty + cols_content = line.split(self.sep) + has_trailing_spaces = self.check_leading_trailing_spaces(cols_content,line_number) + if has_trailing_spaces: + self.global_errors += 1 + + if line.startswith('rsID') or line.startswith('chr_name'): + continue + + self.validate_line_content(cols_content,variant_lines_count) + else: + self.logger.error(f'- Line {line_number} is empty') + self.global_errors += 1 + + # Compare the number of metadata lines: read vs expected + if meta_lines_count != len(self.meta_format): + self.logger.error(f'- The number of metadata lines [i.e. starting with the "#" character] in the file ({meta_lines_count}) and the expected number of metadata lines ({len(self.meta_format)}) are different') + diff_list = list(set(self.meta_format).difference(file_meta)) + self.logger.error(f" > Missing metadata line(s): {', '.join(diff_list)}") + self.global_errors += 1 + + + def validate_data(self): + ''' Validate the file: data format and data content ''' + if not self.open_file_and_check_for_squareness(): + self.logger.error("Please fix the table. 
Some rows have different numbers of columns to the header") + self.logger.info("Rows with different numbers of columns to the header are not validated") + + # Validate data content and check the consitence between the declared variants number and the actual number of variants in the file + self.validate_content() + for chunk in self.df_iterator(self.file): + to_validate = chunk[self.cols_to_read] + to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded + + # Schema validation + self.schema = pandas_schema.Schema([self.validators[h] for h in self.cols_to_validate]) + errors = self.schema.validate(to_validate) + self.store_errors(errors) + + self.process_errors() + if len(self.bad_rows) >= self.error_limit: + break + + if not self.bad_rows and not self.global_errors: + self.logger.info("File is valid") + return True + else: + self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit)) + return False + + + def process_errors(self): + ''' Populate the logger error and the list of bad rows with the errors found. ''' + for error in self.row_errors: + if len(self.bad_rows) < self.error_limit or self.error_limit < 1: + self.logger.error(error) + if error.row not in self.bad_rows: + self.bad_rows.append(error.row) + self.row_errors = [] + + + def store_errors(self, errors: List[pandas_schema.validation_warning.ValidationWarning]): + ''' Capture the errors found into a temporary structure before being processed. ''' + for error in errors: + seen = 0 + row_number = error.row + file_line_number = row_number + self.comment_lines_count + 1 # rows are 0 indexes + error.row = str(row_number) + " (line "+str(file_line_number)+")" + col = error.column + # Avoid duplication as the errors can be detected several times + if row_number in self.errors_seen.keys(): + if col in self.errors_seen[row_number].keys(): + seen = 1 + else: + self.errors_seen[row_number][col] = 1 + else: + self.errors_seen[row_number] = { col : 1 } + if seen == 0: + self.row_errors.append(error) + + + def validate_file_extension(self): + ''' Check/validate the file name extension. ''' + check_exts = [self.check_ext(ext) for ext in self.valid_extensions] + if not any(check_exts): + self.valid_ext = False + self.logger.error("File extension should be in {}".format(self.valid_extensions)) + return False + else: + self.valid_ext = True + return True + + + def compare_number_of_rows(self): + ''' Compare the number of data rows between the harmonized and the formatted scoring files. 
'''
+        # Harmonization file - length
+        hm_rows_count = 0
+        for chunk in self.df_iterator(self.file):
+            hm_rows_count += len(chunk.index)
+            gc.collect()
+
+        # Formatted scoring file - length
+        scoring_rows_count = 0
+        scoring_file = f'{self.score_dir}/{self.pgs_id}.txt.gz'
+        if os.path.isfile(scoring_file):
+            for score_chunk in self.df_iterator(scoring_file):
+                scoring_rows_count += len(score_chunk.index)
+                gc.collect()
+
+        comparison_status = True
+        if scoring_rows_count == 0:
+            self.logger.error(f"Can't find the Scoring file '{scoring_file}' to compare the number of rows with the harmonization file!")
+            comparison_status = False
+        elif hm_rows_count != scoring_rows_count:
+            self.logger.error(f'The number of data rows differs between the Scoring file ({scoring_rows_count}) and the Harmonization POS file ({hm_rows_count})')
+            comparison_status = False
+        return comparison_status
+
+
+    def compare_with_filename(self):
+        ''' Check that the filename matches the information present in the file metadata (PGS ID, genome build). '''
+        comparison_status = True
+        if hasattr(self,'file_genomebuild') and hasattr(self,'file_pgs_id'):
+            # Extract some metadata
+            self.get_genomebuild()
+            self.get_pgs_id()
+            # Compare metadata with filename information
+            if self.file_genomebuild != self.genomebuild:
+                self.logger.error("Build: the genome build in the HmPOS_build header ({}) is different from the one in the filename ({})".format(self.genomebuild,self.file_genomebuild))
+                comparison_status = False
+            if self.file_pgs_id != self.pgs_id:
+                self.logger.error("ID: the PGS ID of the header ({}) is different from the one in the filename ({})".format(self.pgs_id,self.file_pgs_id))
+                comparison_status = False
+            # Compare number of rows with Scoring file
+            if self.score_dir:
+                row_comparison_status = self.compare_number_of_rows()
+                if row_comparison_status == False:
+                    comparison_status = row_comparison_status
+            else:
+                self.logger.info("Comparison of the number of rows between Harmonized and Scoring file skipped!")
+        return comparison_status
+
+
+    def df_iterator(self, data_file: str):
+        ''' Set up a pandas dataframe iterator. '''
+        df = pd.read_csv(data_file,
+                         sep=self.sep,
+                         dtype=str,
+                         comment='#',
+                         chunksize=1000000)
+        return df
+
+
+    def check_file_is_square(self, csv_file: str):
+        ''' Check that each row has the same number of columns. '''
+        square = True
+        csv_file.seek(0)
+        reader = csv.reader(csv_file, delimiter=self.sep)
+        count = 1
+        for row in reader:
+            if len(row) != 0:
+                if row[0].startswith('#'):
+                    self.comment_lines_count += 1
+                    continue
+                if (len(row) != len(self.header)):
+                    self.logger.error("Length of row {c} is: {l} instead of {h}".format(c=count, l=str(len(row)), h=str(len(self.header))))
+                    self.logger.error("ROW: "+str(row))
+                    square = False
+            count += 1
+        del csv_file
+        return square
+
+
+    def open_file_and_check_for_squareness(self):
+        ''' Method to read the file in order to check that each row has the same number of columns. '''
+        if pathlib.Path(self.file).suffix in [".gz", ".gzip"]:
+            with gzip.open(self.file, 'rt') as f:
+                return self.check_file_is_square(f)
+        else:
+            with open(self.file) as f:
+                return self.check_file_is_square(f)
+
+
+    def check_leading_trailing_spaces(self, cols:str, line_number:str = None):
+        '''
+        Check if the columns have leading and/or trailing spaces.
+        The leading/trailing spaces should raise an error during the validation.
+ ''' + leading_trailing_spaces = [] + found_trailing_spaces = False + for idx, col in enumerate(cols): + if col.startswith(' ') or col.endswith(' '): + leading_trailing_spaces.append(self.header[idx]+' => |'+str(col)+'|') + if len(leading_trailing_spaces): + if line_number: + line_name = f'line {line_number} has' + else: + line_name = 'following headers have' + self.logger.error("The "+line_name+" leading and/or trailing spaces: "+' ; '.join(leading_trailing_spaces)) + found_trailing_spaces = True + return found_trailing_spaces + + + def check_ext(self, ext:str) -> bool: + if self.file.endswith(ext): + return True + return False + + + def check_build_is_legit(self, build:str) -> bool: + if build in BUILD_LIST: + return True + return False + + + def get_comments_info(self, type:str) -> str: + ''' Retrieve information from the comments ''' + with gzip.open(self.file, 'rb') as f_in: + for f_line in f_in: + line = f_line.decode() + # Update header + if line.startswith(type): + info = (line.split('='))[1] + return info.strip() + + + def validate_filename(self): + ''' Validate the file name structure. ''' + print("To be implemented in inherited classes") + pass + + + def validate_headers(self): + ''' Validate the list of column names. ''' + print("To be implemented in inherited classes") + pass + + + def validate_line_content(self, cols_content:str, var_line_number:int): + ''' Validate each data row. ''' + print("To be implemented in inherited classes") + pass + + + def extract_specific_metadata(self, line:str): + ''' Extra method to extract and validate specific data. ''' + print("To be implemented in inherited classes") + pass + From 08409aafdf47ece789830ea26e2b88f34e5befcf Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 6 Sep 2022 11:27:46 +0100 Subject: [PATCH 02/46] Python 3.10 compatibility changes and minor updates --- pgscatalog_utils/validate/formatted/validator.py | 6 +++--- pgscatalog_utils/validate/harmonized_position/validator.py | 2 +- pgscatalog_utils/validate/validator_base.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py index 0a5ed3b..16bd425 100644 --- a/pgscatalog_utils/validate/formatted/validator.py +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -20,6 +20,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.validators = FORMATTED_VALIDATORS self.valid_cols = VALID_COLS_FORMATTED self.valid_type = VALID_TYPE_FORMATTED + self.setup_field_validation() def extract_specific_metadata(self,line): @@ -44,7 +45,7 @@ def get_and_check_variants_number(self): self.variants_number = int(match_variants_number.group(1)) else: variant_lines += 1 - if re.search('\w+', line): # Line not empty + if re.search(r'\w+', line): # Line not empty cols = line.split(self.sep) has_trailing_spaces = self.check_leading_trailing_spaces(cols,line_number) if has_trailing_spaces: @@ -128,7 +129,7 @@ def validate_data(self): def validate_filename(self): filename = self.file.split('/')[-1].split('.')[0] - if re.match('^PGS\d{6}$', filename): + if re.match(r'^PGS\d{6}$', filename): return True else: self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where the 'X' are the 6 digits of the PGS identifier (e.g. 
PGS000001)".format(filename)) @@ -136,7 +137,6 @@ def validate_filename(self): def validate_headers(self): - self.setup_field_validation() self.detect_genomebuild_with_rsid() required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header) if not required_is_subset: diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py index c12ca58..3058da3 100644 --- a/pgscatalog_utils/validate/harmonized_position/validator.py +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -16,6 +16,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.validators = POS_VALIDATORS self.valid_cols = VALID_COLS_POS self.valid_type = VALID_TYPE_POS + self.setup_field_validation() def extract_specific_metadata(self,line): @@ -65,7 +66,6 @@ def validate_filename(self): def validate_headers(self): ''' Validate the list of column names. ''' # Check if it has at least a "SNP" column or a "chromosome" column - self.setup_field_validation() required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header) if not required_is_subset: self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_POS, self.header)) diff --git a/pgscatalog_utils/validate/validator_base.py b/pgscatalog_utils/validate/validator_base.py index f76adf1..6f9bdc8 100644 --- a/pgscatalog_utils/validate/validator_base.py +++ b/pgscatalog_utils/validate/validator_base.py @@ -113,7 +113,7 @@ def validate_content(self): # Check data else: variant_lines_count += 1 - if re.search('\w+', line): # Line not empty + if re.search(r'\w+', line): # Line not empty cols_content = line.split(self.sep) has_trailing_spaces = self.check_leading_trailing_spaces(cols_content,line_number) if has_trailing_spaces: From e2fc20913fcb2b237df9be43e1f988c69985f18e Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Wed, 14 Sep 2022 13:22:03 +0100 Subject: [PATCH 03/46] Code updates and add tests for validation --- .../validate/formatted/validator.py | 81 ++++------ .../validate/harmonized_position/validator.py | 73 ++------- .../validate/validate_scorefile.py | 85 +++++----- pgscatalog_utils/validate/validator_base.py | 73 ++++++++- tests/data/test_scoring_file_1.txt.gz | Bin 0 -> 1071 bytes tests/data/test_scoring_file_2.txt.gz | Bin 0 -> 877 bytes tests/data/test_scoring_file_3.txt.gz | Bin 0 -> 876 bytes tests/data/test_scoring_file_4.txt.gz | Bin 0 -> 1076 bytes .../data/test_scoring_file_hmpos_37_1.txt.gz | Bin 0 -> 1257 bytes .../data/test_scoring_file_hmpos_37_2.txt.gz | Bin 0 -> 1157 bytes .../data/test_scoring_file_hmpos_37_3.txt.gz | Bin 0 -> 973 bytes .../data/test_scoring_file_hmpos_38_1.txt.gz | Bin 0 -> 1335 bytes .../data/test_scoring_file_hmpos_38_2.txt.gz | Bin 0 -> 1163 bytes .../data/test_scoring_file_hmpos_38_3.txt.gz | Bin 0 -> 975 bytes tests/test_validate.py | 148 ++++++++++++++++++ 15 files changed, 293 insertions(+), 167 deletions(-) create mode 100644 tests/data/test_scoring_file_1.txt.gz create mode 100644 tests/data/test_scoring_file_2.txt.gz create mode 100644 tests/data/test_scoring_file_3.txt.gz create mode 100644 tests/data/test_scoring_file_4.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_37_1.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_37_2.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_37_3.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_38_1.txt.gz create mode 100644 tests/data/test_scoring_file_hmpos_38_2.txt.gz create mode 100644 
tests/data/test_scoring_file_hmpos_38_3.txt.gz create mode 100644 tests/test_validate.py diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py index 16bd425..eda02cc 100644 --- a/pgscatalog_utils/validate/formatted/validator.py +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -81,7 +81,9 @@ def detect_duplicated_rows(self,dataframe_chunk): self.bad_rows.append(index) - def validate_data(self): + def validate_data(self) -> bool: + ''' Validate the file: data format and data content ''' + self.logger.info("Validating data...") if not self.open_file_and_check_for_squareness(): self.logger.error("Please fix the table. Some rows have different numbers of columns to the header") self.logger.info("Rows with different numbers of columns to the header are not validated") @@ -119,24 +121,33 @@ def validate_data(self): if len(self.bad_rows) >= self.error_limit: break if not self.bad_rows and not self.global_errors: - self.logger.info("File is valid") - return True - + if self.is_file_valid(): + self.logger.info("File is valid") + else: + self.logger.info("File is invalid") else: self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit)) - return False + self.set_file_is_invalid() + return self.is_file_valid() - def validate_filename(self): + def validate_filename(self) -> bool: + ''' Validate the file name structure. ''' + self.logger.info("Validating file name...") filename = self.file.split('/')[-1].split('.')[0] - if re.match(r'^PGS\d{6}$', filename): - return True - else: + is_valid_filename = True + if not re.match(r'^PGS\d{6}$', filename): + self.logger.info("Invalid filename: {}".format(self.file)) self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where the 'X' are the 6 digits of the PGS identifier (e.g. PGS000001)".format(filename)) - return False + is_valid_filename = False + self.set_file_is_invalid() + + return is_valid_filename - def validate_headers(self): + def validate_headers(self) -> bool: + ''' Validate the list of column names. 
''' + self.logger.info("Validating headers...") self.detect_genomebuild_with_rsid() required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header) if not required_is_subset: @@ -157,6 +168,10 @@ def validate_headers(self): self.logger.error("Required headers: at least one of the columns '{}' must be in the file header: {}".format(STD_COLS_EFFECT_FORMATTED, self.header)) required_is_subset = None + if not required_is_subset: + self.logger.info("Invalid headers...exiting before any further checks") + self.set_file_is_invalid() + return required_is_subset @@ -185,46 +200,4 @@ def get_genomebuild(self): def init_validator(file, logfile, score_dir=None) -> ValidatorFormatted: validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) - return validator - -# def run_validator(file, check_filename, logfile, score_dir=None): - -# validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile) - -# validator.logger.propagate = False - -# if not file or not logfile: -# validator.logger.info("Missing file and/or logfile") -# validator.logger.info("Exiting before any further checks") -# sys.exit() -# if not os.path.exists(file): -# validator.logger.info("Error: the file '"+file+"' can't be found") -# validator.logger.info("Exiting before any further checks") -# sys.exit() - -# is_ok_to_run_validation = 1 -# validator.logger.info("Validating file extension...") -# if not validator.validate_file_extension(): -# validator.logger.info("Invalid file extension: {}".format(file)) -# validator.logger.info("Exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation and check_filename: -# validator.logger.info("Validating file name...") -# if not validator.validate_filename(): -# validator.logger.info("Invalid filename: {}".format(file)) -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating headers...") -# if not validator.validate_headers(): -# validator.logger.info("Invalid headers...exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating data...") -# validator.validate_data() - -# # Close log handler -# validator.logger.removeHandler(validator.handler) -# validator.handler.close() \ No newline at end of file + return validator \ No newline at end of file diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py index 3058da3..b46e8c4 100644 --- a/pgscatalog_utils/validate/harmonized_position/validator.py +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -43,28 +43,33 @@ def validate_line_content(self,cols_content,var_line_number): self.logger.error(f"- Variant line {var_line_number} | 'hm_match_pos' should be 'True': same position ('chr_position={line_dict['chr_position']}' vs 'hm_pos={line_dict['hm_pos']}')") - def validate_filename(self): + def validate_filename(self) -> bool: ''' Validate the file name structure. 
''' + self.logger.info("Validating file name...") pgs_id, build = None, None + is_valid_filename = True # hmPOS filename = self.file.split('/')[-1].split('.')[0] filename_parts = filename.split('_hmPOS_') if len(filename_parts) != 2: self.logger.error("Filename: {} should follow the pattern _hmPOS_.txt.gz [build=GRChXX]".format(filename)) - return False + self.set_file_is_invalid() + is_valid_filename = False else: pgs_id, build = filename_parts self.file_pgs_id = pgs_id self.file_genomebuild = build if not self.check_build_is_legit(build): self.logger.error("Build: {} is not an accepted build value".format(build)) - return False - self.logger.info("Filename looks good!") - return True + self.set_file_is_invalid() + is_valid_filename = False + return is_valid_filename - def validate_headers(self): + + def validate_headers(self) -> bool: ''' Validate the list of column names. ''' + self.logger.info("Validating headers...") # Check if it has at least a "SNP" column or a "chromosome" column required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header) if not required_is_subset: @@ -78,7 +83,11 @@ def validate_headers(self): if not required_pos: self.logger.error("One of the following required header is missing: '{}' and/or '{}' are not in the file header: {}".format(SNP_COLS_VAR_POS, CHR_COLS_VAR_POS, self.header)) required_is_subset = required_pos - + + if not required_is_subset: + self.logger.info("Invalid headers...exiting before any further checks") + self.set_file_is_invalid() + return required_is_subset @@ -86,52 +95,4 @@ def validate_headers(self): def init_validator(file, logfile, score_dir=None) -> ValidatorPos: validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) - return validator - -# def run_validator(file, check_filename, logfile, score_dir=None): - -# validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile) - -# validator.logger.propagate = False - -# if not file or not logfile: -# validator.logger.info("Missing file and/or logfile") -# validator.logger.info("Exiting before any further checks") -# sys.exit() -# if not os.path.exists(file): -# validator.logger.info("Error: the file '"+file+"' can't be found") -# validator.logger.info("Exiting before any further checks") -# sys.exit() - -# is_ok_to_run_validation = 1 -# validator.logger.info("Validating file extension...") -# if not validator.validate_file_extension(): -# validator.logger.info("Invalid file extension: {}".format(file)) -# validator.logger.info("Exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation and check_filename: -# validator.logger.info("Validating file name...") -# if not validator.validate_filename(): -# validator.logger.info("Invalid filename: {}".format(file)) -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Comparing filename with metadata...") -# if not validator.compare_with_filename(): -# validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating headers...") -# if not validator.validate_headers(): -# validator.logger.info("Invalid headers...exiting before any further checks") -# is_ok_to_run_validation = 0 - -# if is_ok_to_run_validation: -# validator.logger.info("Validating data...") -# validator.validate_data() - -# # Close log handler -# validator.logger.removeHandler(validator.handler) -# validator.handler.close() \ No newline at end 
of file + return validator \ No newline at end of file diff --git a/pgscatalog_utils/validate/validate_scorefile.py b/pgscatalog_utils/validate/validate_scorefile.py index f31ef88..3e38bf4 100644 --- a/pgscatalog_utils/validate/validate_scorefile.py +++ b/pgscatalog_utils/validate/validate_scorefile.py @@ -38,55 +38,42 @@ def _file_validation_state(filename: str, log_file: str) -> None: def _run_validator(validator: object, file: str, check_filename: bool, logfile: str, validator_type: str) -> None: ''' Main method to run the PGS file validator ''' - validator.logger.propagate = False - - is_ok_to_continue_validation = 1 - - # Check files exist - if not file or not logfile: - validator.logger.info("Missing file and/or logfile") - is_ok_to_continue_validation = 0 - elif file and not os.path.exists(file): - validator.logger.info("Error: the file '"+file+"' can't be found") - is_ok_to_continue_validation = 0 - - # Validate file extension - validator.logger.info("Validating file extension...") - if not validator.validate_file_extension(): - validator.logger.info("Invalid file extension: {}".format(file)) - is_ok_to_continue_validation = 0 - # Validate file name nomenclature - if is_ok_to_continue_validation and check_filename: - validator.logger.info("Validating file name...") - if not validator.validate_filename(): - validator.logger.info("Invalid filename: {}".format(file)) - is_ok_to_continue_validation = 0 - - # Only for harmonized files - if is_ok_to_continue_validation and validator_type != 'formatted': - validator.logger.info("Comparing filename with metadata...") - if not validator.compare_with_filename(): - validator.logger.info("Discrepancies between filename information and metadata: {}".format(file)) - is_ok_to_continue_validation = 0 - - # Validate column headers - if is_ok_to_continue_validation: - validator.logger.info("Validating headers...") - if not validator.validate_headers(): - validator.logger.info("Invalid headers...exiting before any further checks") - is_ok_to_continue_validation = 0 - - # Validate data content - if is_ok_to_continue_validation: - validator.logger.info("Validating data...") - validator.validate_data() - - if is_ok_to_continue_validation == 0: - validator.logger.info("Exiting before any further checks") - - # Close log handler - validator.logger.removeHandler(validator.handler) - validator.handler.close() + if check_filename: + validator.run_validator() + else: + validator.run_validator_skip_check_filename() + # validator.logger.propagate = False + + # # Check files exist + # if not file or not logfile: + # validator.logger.info("Missing file and/or logfile") + # validator.set_file_is_invalid() + # elif file and not os.path.exists(file): + # validator.logger.info("Error: the file '"+file+"' can't be found") + # validator.set_file_is_invalid() + + # # Validate file extension + # validator.validate_file_extension() + + # # Validate file name nomenclature + # if validator.is_file_valid() and check_filename: + # validator.validate_filename() + + # # Only for harmonized files + # if validator.is_file_valid() and validator_type != 'formatted': + # validator.compare_with_filename() + + # # Validate column headers + # if validator.is_file_valid(): + # validator.validate_headers() + + # # Validate data content + # if validator.is_file_valid(): + # validator.validate_data() + + # # Close log handler + # validator.logger.removeHandler(validator.handler) + # validator.handler.close() def _check_args(args): diff --git a/pgscatalog_utils/validate/validator_base.py 
b/pgscatalog_utils/validate/validator_base.py index 6f9bdc8..80af5c4 100644 --- a/pgscatalog_utils/validate/validator_base.py +++ b/pgscatalog_utils/validate/validator_base.py @@ -42,6 +42,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.errors_seen = {} self.logfile = logfile self.error_limit = int(error_limit) + self.is_valid = True # Logging variables self.logger = logging.getLogger(__name__) @@ -135,8 +136,9 @@ def validate_content(self): self.global_errors += 1 - def validate_data(self): + def validate_data(self) -> bool: ''' Validate the file: data format and data content ''' + self.logger.info("Validating data...") if not self.open_file_and_check_for_squareness(): self.logger.error("Please fix the table. Some rows have different numbers of columns to the header") self.logger.info("Rows with different numbers of columns to the header are not validated") @@ -156,12 +158,21 @@ def validate_data(self): if len(self.bad_rows) >= self.error_limit: break - if not self.bad_rows and not self.global_errors: + if not self.bad_rows and not self.global_errors and self.is_valid: self.logger.info("File is valid") - return True else: self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit)) - return False + self.set_file_is_invalid() + return self.is_valid + + + def is_file_valid(self) -> bool: + ''' Method returning the boolean value: True if the file is valid, False if the file is invalid. ''' + return self.is_valid + + def set_file_is_invalid(self): + ''' Set the flag "is_valid" to False. ''' + self.is_valid = False def process_errors(self): @@ -196,14 +207,16 @@ def store_errors(self, errors: List[pandas_schema.validation_warning.ValidationW def validate_file_extension(self): ''' Check/validate the file name extension. ''' + self.logger.info("Validating file extension...") check_exts = [self.check_ext(ext) for ext in self.valid_extensions] if not any(check_exts): self.valid_ext = False + self.set_file_is_invalid() + self.logger.info("Invalid file extension: {}".format(self.file)) self.logger.error("File extension should be in {}".format(self.valid_extensions)) - return False else: self.valid_ext = True - return True + return self.valid_ext def compare_number_of_rows(self): @@ -234,6 +247,7 @@ def compare_number_of_rows(self): def compare_with_filename(self): ''' Check that the filename matches the information present in the file metadata (PGS ID, genome build). 
''' + self.logger.info("Comparing filename with metadata...") comparison_status = True if hasattr(self,'file_genomebuild') and hasattr(self,'file_pgs_id'): # Extract some metadata @@ -242,10 +256,10 @@ def compare_with_filename(self): # Compare metadata with filename information if self.file_genomebuild != self.genomebuild: self.logger.error("Build: the genome build in the HmPOS_build header ({}) is different from the one on the filename ({})".format(self.genomebuild,self.file_genomebuild)) - check_status = False + comparison_status = False if self.file_pgs_id != self.pgs_id: self.logger.error("ID: the PGS ID of the header ({}) is different from the one on the filename ({})".format(self.pgs_id,self.file_pgs_id)) - check_status = False + comparison_status = False # Compare number of rows with Scoring file if self.score_dir: row_comparison_status = self.compare_number_of_rows() @@ -253,6 +267,9 @@ def compare_with_filename(self): comparison_status = row_comparison_status else: self.logger.info("Comparison of the number of rows between Harmonized and Scoring file skipped!") + if not comparison_status: + self.logger.info("Discrepancies between filename information and metadata: {}".format(self.file)) + self.set_file_is_invalid() return comparison_status @@ -338,6 +355,46 @@ def get_comments_info(self, type:str) -> str: info = (line.split('='))[1] return info.strip() + def run_generic_validator(self,check_filename): + self.logger.propagate = False + + # Check files exist + if not self.file or not self.logfile: + self.logger.info("Missing file and/or logfile") + self.set_file_is_invalid() + elif self.file and not os.path.exists(self.file): + self.logger.info("Error: the file '"+self.file+"' can't be found") + self.set_file_is_invalid() + + # Validate file extension + self.validate_file_extension() + + # Validate file name nomenclature + if self.is_file_valid() and check_filename: + self.validate_filename() + + # Only for harmonized files + if self.is_file_valid() and type(self).__name__ != 'ValidatorFormatted': + self.compare_with_filename() + + # Validate column headers + if self.is_file_valid(): + self.validate_headers() + + # Validate data content + if self.is_file_valid(): + self.validate_data() + + # Close log handler + self.logger.removeHandler(self.handler) + self.handler.close() + + def run_validator(self): + self.run_generic_validator(True) + + def run_validator_skip_check_filename(self): + self.run_generic_validator(False) + def validate_filename(self): ''' Validate the file name structure. ''' diff --git a/tests/data/test_scoring_file_1.txt.gz b/tests/data/test_scoring_file_1.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..cd46417a56e0f016489606f5d512f708d861094f GIT binary patch literal 1071 zcmV+~1kn2*iwFo01`=Zc19W9`bYF90Z*pmFXJ2M%Y-L|DE_8Tw0BuxFZ=*;MJ>y@2 zvb$kS$ihQ9`=MtAP$maEKDYvf4`3gJkn}?!O)LauU@@kKA(Tz zJf^GLr`!AeCVgD(AGW(qdbhn_r;GGbRq0#ToiE>RZl>$!=Vx|4TwaQ6*SaZCaFN-PO&hIHvUp58jLZZZ;qO=0B_Ug8LxG@Qm+ha)%Uc`WiGR+`B@QF0USF%1E-#M9>R2UB_g1yj&3H8Q2@a!#t(Jrm zQqFR4$trnDI1?)&xN4ycK*VI2Lg-O!0(nVplTEUajEoBu0PFRLC9k~hQC0=A4w&1- zWX6yVYT_ajm?d&qY#?Mp^BEVy(ysfoql6%zBNZkxN^n$k|>! 
zL$D~06~QG+#D|fK*gEkpM3@-uDasF-C)Bs2dUlW>d(>z`h@%tW3dByPlB5EnGQfzH zQ68X8EE%+u$uZ-MH{SLr_S&)^45Q3y%#%l9TC`Xnxtv2=Hp;gdXrcLcMI@eSKm}Gd zT0xod;-dF1dXE#bv7!?Wfx!e@9mup|^LvsIU`Ch?Hh}C2Rw!?@^`jL;SO#$s1dlvT zwi-fF8Bq@H2G{FCIN_N@s@@6hm8h!|)>twkRMkHT0|mn-K<-X6GUmn4Dtgr-IgL1!A?GnOP-PsODWpW`mPGX^AFzHpRy06)Z?7FPx>DqUa)g*z6D!)xIS&aV3K`cX z`rrr)9)^A8%vi=4CL0hFc+`We;8Srmt6y=t?hc9Z+^fP;AUrbe4tXzX3Lh)gD**lLn`JL)m40EBU(?O@Y4`HiV!zpKCmD;#t8&f&^uOWeQk=tl{l_Z*vCp5D2!QAr zYMY{J^Cn#DrVZtMTU^ufvzJyDzao4tu2%*x{|kmgJzqWTa_;0Ex=D5q)$yav+h5l( ze+z9f$&TStpF{rkS)I!H`epf{Xp;RbnyR?8tnBmr7Mi&-Z0~jV_seqiUq9Eu&-du( zy=waz&7X^o+NQ5*Xw%{((rwYE>u^cGNRaqdq-9-wLl6kc&2Acvb5jPU0OZuvOq$!~ z&mD-nHXW5;){`|j0UuZ z`kKrmAaZ`>p){799mIIy&?+#a7HL@w>+C!*&!e;yiz0YlqVL44`=VZ1iVn5aPCGLg zL}3(9RgMi6DKgIYVvGQmCe81foHmKQ2^89q?dM|qxp)a^x|(d>GEsrlzbDEqv|-|& zO@PzNxX}b6M~!SWuaV2rh*_5GE%vm|gK|Ld@L;q-Jq+6mekF+h3@Y$8+SAhWrsF&v zl4n{sGu{J_UE|0iFxJUl2nyvLX&Lofr@SP|EEXw+G`t5JH>)KRhh|D>%VuL(BoODw z_cUf%kk-YL#wy-PKT6gpJ?P$ASJ7+bjeh87&%Cb@(;Rm*#Fm9vX~;g-qqqam!!~(> z7$e>{XKvUIchgqtxNt;Pj0Am6&6soMp;GdA09{|x2y2~D5#sv DlbF69 literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_3.txt.gz b/tests/data/test_scoring_file_3.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..6a2fef35d5b7605e6d95e12e976947db0f44bb4e GIT binary patch literal 876 zcmV-y1C#t8iwFoh2ohre19W9`bYF90Z*pmFXJ2M%Y-L|FE_8Tw0Buvjj^a2FJ%_J| z#AW7yvCA%(JrV~N(Fmy*X1m$Lo}x4{5D7>W($kv1-@?QkR?7v#-S5*JVD*Zz1HTzHe_orqlHP_4Os%<85E|<+(YEraMln=IeTH z%IY>9SLfo;x5aJWboKR^-t$A#hG z>p!dIX1$1(miae!`8{86Ubb(4&UfqWW;A9KMP1Dpkn#2S=2~9DZ1u;o_-j|ZER>W= zd4~F~tox!1ZPWFknr+H9uRf!-s{EbcOWC#zUjG*ihi0~X*%my>JLN{>uTURPebN7J z!|WsUj+=c{ndzkvkxUF20$|4>pWsonHu6goFyU?L?sJ;Blb z)Wu@5`-ir<#XV%kW3@D_x5y?+Nb=4p#x!G~r81n}L`FLmop|9C>*HWVOQ%Ue^v^V}z*3Ub3Z#*akyVuR+ENO~ zlQn{Sgi``FHw?o#Z7uu4XaxmJD8NLs)`P?tKndf>#!7622)qa8aBj;qE#c|J5_4IUhyZJ%DzSQq2a)akY<{y+H0&rf^1EW?~9wz@w%i z9Ao6+Q5dhlA{muvD0z)0T2VrKM+OpPl_ag1#V$brvT2(XiGb8_CtC1N;dv*FX|Oi( zm}VtdH%RgYX`UUcWRt*?W4OB{$3_V{1>n@yxF=@F0zEbk!UB>_>p|^2c=ZfUo@6>A zWD=i8+z*d47ED?=I?t!5qoiTTIZQ6C;*Fq*iYTmgJmLvCBmLuIK^O$CXb|L90qB|S ze%dMx;|tQ#({NqNIg)9omLziNOo~lfgM5Yq#{Ugov5iK6@0@s75&9Ri=dFC{1polF CFTP*^ literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_4.txt.gz b/tests/data/test_scoring_file_4.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..7e57cfe06391dc2d1ce5f6f7ae5c703c0a6c29b0 GIT binary patch literal 1076 zcmV-41k3v$iwFqg2ohre19W9`bYF90Z*pmFXJ2M%Y-L|GE_8Tw0BuxFZ=*;MJ>y@2 zvb$kS$j6g9`=MtAP$maEKDXl|9&0~c%{|)f}tO;UcGw7d_Moa zc}!QgPq+8`P5QXnKWulK^lp2FC9C8q;Y_TA;Hrf(01=a63ZX}_3FIZYO*Y9wGBPeu0Ib&|mb~)1M_CofI$&-S zlNm!gsELbAV3x>bjnm4Y-YQv8rX+YzfN8+k8)dZvinSsm1g=$LGwU^uMlNlQB4>N` z48fu}Rs@$Q5g$e_V(Y}a5Mg4przk&Uo>1S8>e)el>`|i$A&yRfD-b&wO8!$4_x)*N zWt7MHCRPjD$z+ys#v5;Y6nkyi4_;AbHAcy!CM{a5kJX$*M>Z}nJ`QuRquuNO4QX0Yb==_#o{sFVb&qTLs64smb40eL8BeW z>|~hXfM`xb^dw=Rf_OwIjt7Cj{0)svqO9}WW2{2tOfNl!Upja(JQ?A<4lEJoNQu^< zLWi8OMZ<1Npn=~}l?V%sk=GgC;cMhs3HBlkD9z4ADlj?4&QArc(})N_SPKocaf)b= zAu)Q@*9%4mEJTk&C>3%x+5kHdsS#zc{4wxW8*wZ{NI7$C#a(j)CHPE4ln)%}>Zk-! 
z(^?>=*rq-<7DPySj15#7$7aeY5xON&J<10xpN{m0olo{Tr+#cCF>C#$FVau(mu7t?qUSH{Hqx;|HTx3`z)+uPglysNfpogRwa zuqb!8S@Cu{6lqr7_OnCsQlFD*Ta@`}cX`GyMH#1Ame+Z4N)K^$ybYT4^U5J{em06^}qSvjRgGn##y zCVy^{`w2z}V+~MUrg@!|=3JDu$wtfc9A~dCv@HE~fsg6=4B+Wg(7Y6*>HR8!owUNL z*MBqlZeJ($ud^9Fn>y|Fcji)IwX4x+k=n1;@j5njoF0bpGOgp8ImKTHBKYet&WijFqaj9A-Bt{9 zr3L^D5R!@xIBfW1{PeI|&i`?H#owo!@qDrF^?w{6SL;r|&03b$W(4YFfaHLv%QAFo zyHB?Ja`gLVQ@)y;yO^Rs;+y;QP}vvsug^Re)&DcgYW_U}E!Q;n0qD*nk$HJB+d4@P z2Xin{QSVK8d3lGL@<;n)QWdY|*4R%@5S}1qAE0I*;5JjPti-rcfVGc2t}nobh!~QD zQyJk1Bf*Il(InbLXb24)8%^l@NE%&MviQ%?jLAjf*HYDTbwYBm5j#GEE>om zWHu#oMk5ycRv0$@TBW)eeSlO{0E6$Q8@7Lq+J${E6f+nU;XOGI3jO=J>DsoH6N z!tQi{3#o*5xMUR4vK`Pvg&_?dG$3I6c?zhi>koz6N=%$LTxbYuOoa3j4cLBYNW&m$ za7++G3bbYyHL0nPZL57k_vDyhiYabkq9`Y-v6?Fzr!W{u+qVJP?}4O}E4P&p&@X;9 z8i$>sOzjo~p{rkr%+prlGuo-Xpj1g_8 z;Z-W&`t?#^^JXKcCrp^@sO20EHS=arK=zG+xKPBR98nDe>cg=x7nNjf_Xk`*Dxs(* z(%}MTA*B%3`v4+@L#sT)hHQUiFz~RefRsey_X~gyflPu6ARzlvNW$fy8<-LyTwKk~ zLE93L;zcl?cM6XjdyB`x!#g@bkyPN)wISK9yRc_B9fc{jgxF!Cn@IePGbL zq0X T-uw#KEw%pu^t^ev3Jd@MqV`|+ literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_hmpos_37_2.txt.gz b/tests/data/test_scoring_file_hmpos_37_2.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..8acaa73155ffa989b728af49308825bbcaeb26ab GIT binary patch literal 1157 zcmV;01bX`)iwFqQ5Eo+r19W9`bYF90Z*pmFXJ2M%Y-L|)ZE$aMUo$sfGA?v@bO3c# zO>g2z5Ir})q7^PP(gOYc5y=OJAq*0MY+w(&7t4b?fCL!X&P1ct{`al!wlj9L%f%nh zudC|St714DJ}tI*Hr-8En+4v^HqXoT0zWKQb3DS8bNE=-x9WZ}X|Hc@Z{ypc+Gllk zE)L_OJWTT9?RqY(rRCnr|Y4;;+LYtS)QNjQ*q7C_;iIAm&`v6hVeH2 z;mQi06E-FTNNThCb1`2pXCb9I{sdWm#>@4?=J|2DTW;2aAtaHW@&o`9e(}vUySQZW zWuE@NOCM&G5K1jTb(x*&v~;(ktX-b0vm4G|Luz^UvxP6&?FQiWKSB3WB=d(&3X!a# zW-xqnr^B&M>z_B5e0Oy=7#`fUxVZHD>*<^)i|5&q>%s6xR-UqJ4Vk@OzPmEvzP#<` zkLTI^V?DQ6&r@5^{!|Bzl2zuZHQr)Z#W8Lcg3F=Vf6DD=f&xsk})OROe#U8 zF(Uv*glNM@5(0jkK0j{O%fCV_{B^#YE?3*Z@Z05Sv;B~8e3xgnOW<@iB6`G3`x-j6 zKc@R*nf!gXD_`B+J?7*$eD{!@EB`?L>3?}Es{jA8td?JqdwFsDI?c{!cXsVl)D|5t zX;r+Id*|=M*Kmd4{sJfa3s}jOoBHU=X*dB6Wz=Y;4B8^qQaAK_yB7`AUQldMI76b+r!dR_5gEDIs zL({;ZN|-LA0euXvEjJu38je;xMuBWO`@r#_=SWK|lfa-vaV{l98F`9gaQ}W(l14DafhQzd167r0S5n(739a4)1l7Y;O&rDmvpc^5K(6*mg z14fONWGXg>Y02Q|CX5!UD_V;NlC(w}H1+imC4_G35!{$Arv-t-iDnQi1Y$&3>&pQO z*GkA9C15@Zg~naHA(r>bY2auyCOBz~ri_BK77a`BUOf%!ZDRaa z6F&@!SOS9{HK4Yqc8f9a0yR=g$-5ZFozSSKc= z2H#o9-Zu}0HOqRgYhX~&6D2o_M;e1{+YzTyk^UexFem|Q1{NbM;$7SfN|MG7;4>=v Xiy?qvD!@%J<9n=f1V=K literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_hmpos_37_3.txt.gz b/tests/data/test_scoring_file_hmpos_37_3.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..601865aa1206ab2ba7bfdcf55f289c77e2cb01b4 GIT binary patch literal 973 zcmV;;12X&{iwFo_1{Y%h19W9`bYF90Z*pmFXJ2M%Y-L|)ZE$aMUo$Rrcys`Ll}(T0 zI1q-<;a6Cl%Z#)be>-j|2bQ53BnEncJ?vhhG%*k<5R`;LqtX8NwVkF%?dhZs?bv=F zSCy;EgkktR+knY91m7fpy5$wwfq*<0Xd3jC?ke|)MDYdgf*nTG8byb^r zOduKt-p=8^VM4#I;zmnWUPT=OD}XY0w4%0c)&t@8BT7|*V!7hT2FhPPS$y`D_{ zcrI-`&-ZwCdE@RVUZfUnzy|0Bqy+@av;i}H2A>Eb_%j4qnLlC#F{06^W|&9O&}2=5#Vda!8{4vdPr$nCy;K z{PSU3UG>8wpy&_q@RSy{ouL2TD=%f;tu)8DdV}=rhdFEA=1@v^VO_8ldN9Kgaj<}#q8_)M z@Hbf9VbhI0x9x;JOer(hH3<~tO!Qf{LLcUcx;$fu73KzxI$$gGVI^A2=9#0s$FS9T ztPqH~kQo#a>&cW8dax7>IjX v$jmF$9kUa9uu|gw2niZR-LMnBhfP5e`dVNL^`xp5{(}Dp78*MM3&CTWd^77I@KAl#1 zo$t!0epx-;tjm|fuFTh`o9=p-J=VwUv?{CZ;puWsAImDu*XwP)Ef4uF-5xlw&;4De z+x*Rb7S)OG2iO-K7B#>7b26SyM?um!{m3dmrPJB%{NaAMoX%&RE{n*v>j58#@M|K6 zd|wPEU&h(*%j|XpLI5~Es;hikXH{`5tGZYZX8AE)KL=^+{MY4ppC6BWc=*r5;;|fz zZ|51m$yusSna6m{O|J{5;@Ut~X?x4ZRV@-W&+>U4kR)iyuWtn7LJ 
zqo@Ybr?;5@@h}>{&F9$U^Zk;~YFmdI4eoOPXq_(7qE7Q&Kb_@uIw=n6M-&L3`supd zeg@G8A??jcfws2C2R$Lk%AV#Iej7gA&u7!W!n@M1AA=cA?($!4FO%I9iT_Vq;4T+tIc;)vlAq!r{zNP`tt`k)pLv|)x}BSR?#(nSn0mLyT50+d6fK&zFJ zLU9_Sq~|F>kh2zI5fR#eaHu>7S|zkJ97HKV5wM6+BSLE{`8cA8!6bx4&mgrDV6?B0 zuO0~uc{D0W5ZO497;P1Rq^6$QkTouFkdiFi*VFYaa9a9ZIa{>VWSgF#;TnY+ORf|F zhOPz~0vw>Bt5jMv7y@O34B>dt&ruXgSZ0`9ILJvuu{B~i@))X6QDu}>R)?UK8f{~Z zP!27@S1oc^3zo(>6Bs0SSld8r3|C3F#?UN^(~k?(B`l#H-Xb3=)kA z8k+5mMxe1VK?DXK(q?aJBYBuP$a`1%x=mi!CURNgM#Na@{pSgsVrwd|i_N43%BYos zouFa~5g3(nCR9Q3M>;G)EfpEISi7sxpl0XBZh8fy<GK_aq1YRUpW{9wU^=fS9k~_v5-bbzDZ-s0PsZd#~?bTdH tXlb#j10a`QfaRzGo@cE>9*wDuv7Fr1erVCcvS-oCj002s9l0*Oi literal 0 HcmV?d00001 diff --git a/tests/data/test_scoring_file_hmpos_38_2.txt.gz b/tests/data/test_scoring_file_hmpos_38_2.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..8c1ec1853483ed1897a38b47b5afcb5559ba0f7f GIT binary patch literal 1163 zcmV;61a$i!iwFqQ5Eo+r19W9`bYF90Z*pmFXJ2M%Y-L|)ZE$aMUo$vgGA?v@bO41_ zO>g2z5Ivh;VTH?#v_MyXcO(uBLl`6inZO=)FP4KlfMhT-W}?w*|NB;V+lf6IX}S30 z`E^yjdR6rM{ipdRo=&!t<$4}(rt9a$Y92o2gH`n;JDC0ECj@7X^r+IulAAwpZl@Ppj+rZO zv;OmWI{USrOQ`3msb_bryhh_?>Znz`iES09`7mCkRXn%n_!A`*J`dxpINk#p0%3Y| zHM)vj9?y4lsbDP`d zDX1JBPRX^nl{@S1{MUHJG~5M`cNa93E!Wk-mJ@#h9!fzgDRs1s$dCe4nnttLX7>1H z8GV2L`75N7$y`58cm}C3Mv3Nv`7u=Nk)NYNMXbn&AAUljBqD?Fd7gUz%{zj zI0jH-B#kDX0g~%BqXu0J4j_q)<~2u27NTg)8U2OhK%Wh6Z zP)`xYpjlBAEi@xRF(MR-)>f+ zG(vaqox*Ayg-U|DcG}}n7e@m#-f5sD#jW7NI12h%cXF#yMYk{)R?mY0pgPMK|oUJ?gNF&&h7{pjhuXf z6P(e>$PI%ehT!-G4Zw8NZZL{oU=-&Rm%N)pg~BLV8ynp(JGwP6MCc@iL#su!^;?H> z&1(CAVnlSdjt51Aj^~;x?wjiV4iyPj0Oe++-P9UokrCKaVVrDeSV4_>&y93;TZ8fl zoRe@Km`@GAGlITv9>6tAJFcr?R1o(YjyoO`XgF288F3JjbO))1Q3;%70caN=4rgBq dQriK22E4l%Jm~k2Kp&ZC@*m3A;W(KGk-Y)^Pa6OL4K*mmEchPPH#iDHivSmo}1cO1}y+@=~vbz+0$3>OxyGz#Y>ZUv&h7s-S2IWOjc4d9eD^#9g;gt8Y(Kx(j-(Azf zJY|@SAgJ}zAB*{FIrAaS(I<%VGg_`5*Dv3u+vR#S8bc6SS){N*{1aBr`N^e=U+39x z+w5^h38B=&R@dZZmo@HEH=QfeRenLm)rVH(KZfm7e!0NrIlAfgbvl1sXK<1=)QrY& zt~?yOtow0s>8tDV(fHub^~q(g*Rm?o#mnr-^=SM(Z_51KL1fp{t7}r;r?*-E{xX}t z&*u{Ic^>lFm7T9q`jq#dc4&iKhw=)eRoFW=Uy<)8j6^ku%CE}u4|@z>Mydh_pq&9GMFol8NTO^BW_J1j$|cE@aY zY|@{1+ve)-9?0kfFFLnDrI0*XACH)KF-YALN@7=D;@~U!`OX}|E znqhL=3Oz`Ur>w27&Cd02O}qTkk08|#pwAa4Q5Odni#6MRvu+isbrB%6N!nG8c_WX)zsWj%_xA2gkhrWFFwUqIIS`}2`7r|vj)JhXXl_J82DzGvl zQY%Io3#e9t`11|6ghW&sV~PY6+IDk`lvaFn3uFA2m9|{@W>Kt!HZfGn;1)xLxRq54 z9Vshg5Mw|U`^T>ESwxjH;Z+rZs`*+;Oqq^&5YFemS&CAjy(-7OhXJ+kQLS`9H6K)& zU=d?6#pO*E%y|2F4DkfCl3W#b+NV-p$qw{T{3NXrOe xd;vHXa9=*g%0bq6)LJ!M7sZW Date: Wed, 14 Sep 2022 14:26:13 +0100 Subject: [PATCH 04/46] Catch ftp download errors --- .../download/download_scorefile.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index fc35529..f31c7ab 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -3,9 +3,11 @@ import os import shutil import textwrap +import time from contextlib import closing from functools import reduce from urllib import request as request +from urllib.error import HTTPError, URLError from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url @@ -62,14 +64,26 @@ def _mkdir(outdir: str) -> None: os.makedirs(outdir) -def _download_ftp(url: str, path: str) -> None: +def _download_ftp(url: str, path: str, retry:int = 0) -> None: if os.path.exists(path): logger.warning(f"File already exists at {path}, skipping download") return 
else: - with closing(request.urlopen(url)) as r: - with open(path, 'wb') as f: - shutil.copyfileobj(r, f) + try: + with closing(request.urlopen(url)) as r: + with open(path, 'wb') as f: + shutil.copyfileobj(r, f) + except (HTTPError, URLError) as error: + max_retries = 5 + print(f'Download failed: {error.reason}') + # Retry to download the file if the server is busy + if '421' in error.reason and retry < max_retries: + print(f'> Retry to download the file ... attempt {retry+1} out of {max_retries}.') + retry += 1 + time.sleep(10) + _download_ftp(url,path,retry) + else: + raise RuntimeError("Failed to download '{}'.\nError message: '{}'".format(url, error.reason)) def _check_args(args): From 5fe9fad79cd9c7d7e9c8b53407c2c209de77e33e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 15 Sep 2022 13:26:07 +0100 Subject: [PATCH 05/46] pin polars to 0.14.9 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b9899ab..b8262b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ pandas = "^1.4.3" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" -polars = "^0.14.9" +polars = "0.14.9" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From cd7ee7812236476321dcb1dbccc8147147651f21 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 15 Sep 2022 13:55:38 +0100 Subject: [PATCH 06/46] bump version for next release --- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index 10939f0..8ce9b36 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.2' +__version__ = '0.1.3' diff --git a/pyproject.toml b/pyproject.toml index b8262b2..65786fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.1.2" +version = "0.1.3" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert "] From 96896e36df7edd69068b1c57d5cb6e3381e7502d Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 16 Sep 2022 15:11:00 +0100 Subject: [PATCH 07/46] Handle PGS Catalog REST API errors and retries --- pgscatalog_utils/download/publication.py | 10 ++++---- pgscatalog_utils/download/score.py | 32 +++++++++++++++++++++--- pgscatalog_utils/download/trait.py | 10 ++++---- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py index 843b8a2..56c7f7b 100644 --- a/pgscatalog_utils/download/publication.py +++ b/pgscatalog_utils/download/publication.py @@ -1,20 +1,20 @@ import logging from functools import reduce -import requests +from pgscatalog_utils.download.score import query_api logger = logging.getLogger(__name__) def query_publication(pgp: str) -> list[str]: - api: str = f'https://www.pgscatalog.org/rest/publication/{pgp}' logger.debug("Querying PGS Catalog with publication PGP ID") - r: requests.models.Response = requests.get(api) + api: str = f'/publication/{pgp}' + results_json = query_api(api) - if r.json() == {}: + if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {pgp}") raise Exception - pgs: dict[str, list[str]] = r.json().get('associated_pgs_ids') + pgs: dict[str, list[str]] = 
results_json.get('associated_pgs_ids') logger.debug(f"Valid response from PGS Catalog for PGP ID: {pgp}") return list(reduce(lambda x, y: set(x).union(set(y)), pgs.values())) diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index a38dc0c..edad470 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -3,6 +3,7 @@ import jq import requests +import time logger = logging.getLogger(__name__) @@ -28,11 +29,36 @@ def get_url(pgs: list[str], build: str) -> dict[str, str]: return dict(zip(pgs_result, url_result)) +def query_api(api: str, retry:int = 0) -> dict: + max_retries = 5 + wait = 60 + results_json = None + rest_url_root = 'https://www.pgscatalog.org/rest' + try: + r: requests.models.Response = requests.get(rest_url_root+api) + r.raise_for_status() + results_json = r.json() + except requests.exceptions.HTTPError as e: + print(f'HTTP Error: {e}') + if r.status_code in [421,429] and retry < 5: + retry +=1 + print(f'> Retry to query the PGS Catalog REST API in {wait}s ... attempt {retry} out of {max_retries}.') + time.sleep(wait) + results_json = query_api(api,retry) + except requests.exceptions.ConnectionError as e: + print(f'Error Connecting: {e}') + except requests.exceptions.Timeout as e: + print(f'Timeout Error: {e}') + except requests.exceptions.RequestException as e: + print(f'Request Error: {e}') + return results_json + + def query_score(pgs_id: list[str]) -> dict: pgs: str = ','.join(pgs_id) - api: str = f'https://www.pgscatalog.org/rest/score/search?pgs_ids={pgs}' - r: requests.models.Response = requests.get(api) - return r.json() + api: str = f'/score/search?pgs_ids={pgs}' + results_json = query_api(api) + return results_json def _chunker(pgs: list[str]): diff --git a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index c2db495..83af414 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -1,24 +1,24 @@ import logging from functools import reduce -import requests +from pgscatalog_utils.download.score import query_api logger = logging.getLogger(__name__) def query_trait(trait: str) -> list[str]: - api: str = f'https://www.pgscatalog.org/rest/trait/{trait}?include_children=1' logger.debug(f"Querying PGS Catalog with trait {trait}") - r: requests.models.Response = requests.get(api) + api: str = f'/trait/{trait}?include_children=1' + results_json = query_api(api) - if r.json() == {}: + if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}") raise Exception keys: list[str] = ['associated_pgs_ids', 'child_associated_pgs_ids'] pgs: list[str] = [] for key in keys: - pgs.append(r.json().get(key)) + pgs.append(results_json.get(key)) logger.debug(f"Valid response from PGS Catalog for EFO term: {trait}") return list(reduce(lambda x, y: set(x).union(set(y)), pgs)) From 50b9e9e1ee2b7242059e7865fd08e74a1ba460f4 Mon Sep 17 00:00:00 2001 From: Sam Lambert Date: Tue, 20 Sep 2022 11:16:23 +0100 Subject: [PATCH 08/46] Update README.md --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d19c186..7c897b8 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,8 @@ [![CI](https://github.com/PGScatalog/pgscatalog_utils/actions/workflows/main.yml/badge.svg)](https://github.com/PGScatalog/pgscatalog_utils/actions/workflows/main.yml) -This repository is a collection of useful tools for working with data from the -PGS Catalog. 
This is mostly used internally by the PGS Catalog calculator, but -other users might find some of these tools helpful. +This repository is a collection of useful tools for downloading and working with scoring files from the +PGS Catalog. This is mostly used internally by the PGS Catalog Calculator ([`PGScatalog/pgsc_calc`](https://github.com/PGScatalog/pgsc_calc)); however, other users may find some of these tools helpful. ## Overview @@ -66,4 +65,4 @@ doi:[10.1038/s41588-021-00783-5](https://doi.org/10.1038/s41588-021-00783-5). This work has received funding from EMBL-EBI core funds, the Baker Institute, the University of Cambridge, Health Data Research UK (HDRUK), and the European Union's Horizon 2020 research and innovation programme -under grant agreement No 101016775 INTERVENE. \ No newline at end of file +under grant agreement No 101016775 INTERVENE. From 268e96aef7f17ea4f72094bc7e0060a3c614ea96 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 14:53:20 +0100 Subject: [PATCH 09/46] add memory profiler to development dependencies --- poetry.lock | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 5 +- 2 files changed, 186 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index d776774..b8afbdd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -28,6 +28,17 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "cffi" +version = "1.15.1" +description = "Foreign Function Interface for Python calling C code." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pycparser = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -47,6 +58,24 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "contourpy" +version = "1.0.5" +description = "Python library for calculating contours of 2D quadrilateral grids" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16" + +[package.extras] +test-no-codebase = ["pillow", "matplotlib", "pytest"] +test-minimal = ["pytest"] +test = ["isort", "flake8", "pillow", "matplotlib", "pytest"] +docs = ["sphinx-rtd-theme", "sphinx", "docutils (<0.18)"] +bokeh = ["selenium", "bokeh"] + [[package]] name = "coverage" version = "6.4.4" @@ -61,6 +90,36 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cycler" +version = "0.11.0" +description = "Composable style cycles" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "fonttools" +version = "4.37.3" +description = "Tools to manipulate font files" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["scipy", "munkres"] +lxml = ["lxml (>=4.0,<5)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.23.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +ufo = ["fs (>=2.2.0,<3)"] +unicode = ["unicodedata2 (>=14.0.0)"] +woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] + [[package]] name = "idna" version = "3.3" @@ -85,6 +144,45 @@ category = "main" optional 
= false python-versions = ">=3.5" +[[package]] +name = "kiwisolver" +version = "1.4.4" +description = "A fast implementation of the Cassowary constraint solver" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "matplotlib" +version = "3.6.0" +description = "Python plotting package" +category = "dev" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.0.1" +numpy = ">=1.19" +packaging = ">=20.0" +pillow = ">=6.2.0" +pyparsing = ">=2.2.1" +python-dateutil = ">=2.7" +setuptools_scm = ">=7" + +[[package]] +name = "memory-profiler" +version = "0.60.0" +description = "A module for monitoring memory usage of a python program" +category = "dev" +optional = false +python-versions = ">=3.4" + +[package.dependencies] +psutil = "*" + [[package]] name = "numpy" version = "1.23.1" @@ -120,6 +218,18 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "pillow" +version = "9.2.0" +description = "Python Imaging Library (Fork)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pluggy" version = "1.0.0" @@ -149,6 +259,17 @@ xlsx2csv = ["xlsx2csv (>=0.8.0)"] pytz = ["pytz"] pyarrow = ["pyarrow (>=4.0)"] +[[package]] +name = "psutil" +version = "5.9.2" +description = "Cross-platform lib for process and system monitoring in Python." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"] + [[package]] name = "py" version = "1.11.0" @@ -157,6 +278,14 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "pyliftover" version = "0.4" @@ -257,6 +386,23 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "setuptools-scm" +version = "7.0.5" +description = "the blessed package to manage your versions by scm tags" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +packaging = ">=20.0" +tomli = ">=1.0.0" +typing-extensions = "*" + +[package.extras] +test = ["pytest (>=6.2)", "virtualenv (>20)"] +toml = ["setuptools (>=42)"] + [[package]] name = "six" version = "1.16.0" @@ -273,6 +419,14 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "typing-extensions" +version = "4.3.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "urllib3" version = "1.26.11" @@ -286,18 +440,36 @@ brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +[[package]] +name = "zstandard" +version = "0.18.0" +description = "Zstandard bindings for Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""} + +[package.extras] +cffi = ["cffi (>=1.11)"] + [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "607d2d543f52a4ecc116c0b912c499a83cd1c740244323c81fdfe89ba27a55eb" +content-hash = "a0d60a1fec35d248340f1640db49d07a7000b23e4bbe22426a9c240ee499c334" [metadata.files] atomicwrites = [] attrs = [] certifi = [] +cffi = [] charset-normalizer = [] colorama = [] +contourpy = [] coverage = [] +cycler = [] +fonttools = [] idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, @@ -307,6 +479,9 @@ iniconfig = [ {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] jq = [] +kiwisolver = [] +matplotlib = [] +memory-profiler = [] numpy = [] packaging = [] pandas = [ @@ -332,15 +507,18 @@ pandas = [ {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, ] +pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] polars = [] +psutil = [] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = 
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +pycparser = [] pyliftover = [ {file = "pyliftover-0.4.tar.gz", hash = "sha256:72bcfb7de907569b0eb75e86c817840365297d63ba43a961da394187e399da41"}, ] @@ -357,6 +535,7 @@ pytz = [ {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, ] requests = [] +setuptools-scm = [] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -365,4 +544,6 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +typing-extensions = [] urllib3 = [] +zstandard = [] diff --git a/pyproject.toml b/pyproject.toml index 65786fe..23caf20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,12 +18,15 @@ pandas = "^1.4.3" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" -polars = "0.14.9" +polars = "^0.14.9" +zstandard = "^0.18.0" [tool.poetry.dev-dependencies] pytest = "^7.1.2" pytest-cov = "^3.0.0" pysqlar = "^0.1.2" +memory-profiler = "^0.60.0" +matplotlib = "^3.6.0" [build-system] requires = ["poetry-core>=1.0.0"] From 012ff6de7dc076d5218073b7af4a48f522f6c135 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 14:57:57 +0100 Subject: [PATCH 10/46] add support for reading zstd compressed targets --- pgscatalog_utils/match/preprocess.py | 6 +- pgscatalog_utils/match/read.py | 98 +++++----------------------- pgscatalog_utils/target.py | 87 ++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 84 deletions(-) create mode 100644 pgscatalog_utils/target.py diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 1723f6d..4d93090 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -27,7 +27,7 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) -> pl.DataFrame: +def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format: str) -> pl.DataFrame: # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) @@ -35,10 +35,10 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, pvar: bool) .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if df['is_multiallelic'].sum() > 0: + if df.select('is_multiallelic').sum() > 0: logger.debug("Multiallelic variants detected") if remove_multiallelic: - if not pvar: + if file_format == "bim": logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " "variant representations only") logger.debug('Dropping multiallelic variants') diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index fd1a4c3..c25175a 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,104 +1,40 @@ import glob import logging -from typing import NamedTuple import polars as pl from pgscatalog_utils.match.preprocess import handle_multiallelic, 
complement_valid_alleles +from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool, single_file: bool = False, - chrom: str = "") -> pl.DataFrame: - target: Target = _detect_target_format(path) - d = {'column_1': str} # column_1 is always CHROM. CHROM must always be a string +def read_target(path: str, remove_multiallelic: bool) -> pl.DataFrame: + """ Read one or more targets from a path (may contain a wildcard) """ - if single_file: - logger.debug(f"Scanning target genome for chromosome {chrom}") - # scan target and filter to reduce memory usage on big files - df: pl.DataFrame = ( - pl.scan_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) - .filter(pl.col('column_1') == chrom) - .collect()) - - if df.is_empty(): - logger.warning(f"Chromosome missing from target genome: {chrom}") - return df + if '*' in path: + logger.debug("Wildcard detected in target path: finding all matching files") + paths: list[str] = glob.glob(path) else: - logger.debug(f"Reading target {path}") - df: pl.DataFrame = pl.read_csv(path, sep='\t', has_header=False, comment_char='#', dtype=d) + logger.debug("") + paths: list[str] = [path] - df.columns = target.header + targets: list[Target] = [Target.from_path(x) for x in paths] + dfs: list[pl.DataFrame] = [] + for target in targets: + assert target.file_format in ['bim', 'pvar'] + dfs.append(target.read().pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, + file_format=target.file_format)) - match target.file_format: - case 'bim': - return (df.select(_default_cols()) - .filter(pl.col('ID') != '.') # remove missing IDs - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=False)) - case 'pvar': - return (df.select(_default_cols()) - .filter(pl.col('ID') != '.') - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, pvar=True)) - case _: - logger.error("Invalid file format detected") - raise Exception + return pl.concat(dfs).filter(pl.col("ID") != '.') def read_scorefile(path: str) -> pl.DataFrame: logger.debug("Reading scorefile") scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) - .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) - .with_columns([ + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) + .with_columns([ pl.col('accession').cast(pl.Categorical), pl.col("effect_type").cast(pl.Categorical)])) return scorefile - - -class Target(NamedTuple): - """ Important summary information about a target genome. Cheap to compute (just reads the header). 
""" - file_format: str - header: list[str] - - -def _detect_target_format(path: str) -> Target: - file_format: str - header: list[str] - - if "*" in path: - logger.debug("Detecting target file format") - path = glob.glob(path)[0] # guess format from first file in directory - - with open(path, 'rt') as f: - for line in f: - if line.startswith('#'): - logger.debug("pvar format detected") - file_format = 'pvar' - header = _pvar_header(path) - break - else: - logger.debug("bim format detected") - file_format = 'bim' - header = _bim_header() - break - - return Target(file_format, header) - - -def _default_cols() -> list[str]: - return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] # only columns we want from a target genome - - -def _pvar_header(path: str) -> list[str]: - """ Get the column names from the pvar file (not constrained like bim, especially when converted from VCF) """ - line: str = '#' - with open(path, 'rt') as f: - while line.startswith('#'): - line: str = f.readline() - if line.startswith('#CHROM'): - return line.strip().split('\t') - - -def _bim_header() -> list[str]: - return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py new file mode 100644 index 0000000..6b28998 --- /dev/null +++ b/pgscatalog_utils/target.py @@ -0,0 +1,87 @@ +import zstandard +from dataclasses import dataclass +import io +import logging +import polars as pl + +logger = logging.getLogger(__name__) + + +@dataclass +class Target: + """ Class to detect and read a plink1/plink2 variant information file """ + file_format: str = None + header: list[str] = None + path: str = None + compressed: bool = False + + @classmethod + def from_path(cls, path): + """ Create a Target object from a path. Cheaply detect file format and headers. """ + try: + with open(path, 'r') as f: + file_format, header = _get_header(f) + compressed = False + except UnicodeDecodeError: + logger.error("Can't open target as a text file, so trying to read zstd compressed binary file") + with open(path, 'rb') as f: + dctx = zstandard.ZstdDecompressor() + stream_reader = dctx.stream_reader(f) + text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8') + file_format, header = _get_header(text_stream) + compressed = True + + return cls(file_format=file_format, path=path, header=header, compressed=compressed) + + def read(self) -> pl.DataFrame: + """ Read variant information into a polars df (expensive operation). Automatically handle compressed data. 
""" + # column_1 is always CHROM, which must always be a string or X/Y/MT/PAR will break inferred dtypes + logger.debug("Reading target into memory") + chrom_dtype = {'column_1': str} + if self.compressed: + with open(self.path, 'rb') as f: + dctx = zstandard.ZstdDecompressor() + with dctx.stream_reader(f) as reader: + df: pl.DataFrame = pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) + df.columns = self.header + return df.select(_default_cols()) + else: + df: pl.DataFrame = pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) + df.columns = self.header + return df.select(_default_cols()) + + +def _get_header(fh) -> tuple[str, list[str]]: + header = None + file_format = None + logger.debug(f"Scanning header to get file format and column names") + for line in fh: + if line.startswith('#'): + logger.debug("pvar format detected") + file_format = 'pvar' + header = _pvar_header(fh) + break + else: + logger.debug("bim format detected") + file_format = 'bim' + header = _bim_header() + break + + return file_format, header + + +def _pvar_header(fh) -> list[str]: + """ Get the column names from the pvar file (not constrained like bim, especially when converted from VCF) """ + line: str = '#' + while line.startswith('#'): + line: str = fh.readline() + if line.startswith('#CHROM'): + return line.strip().split('\t') + + +def _bim_header() -> list[str]: + return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] + + +def _default_cols() -> list[str]: + return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] # only columns we want from a target genome From 79759510f95e5a448517b05c20d8d2dfd737be9e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 14:58:10 +0100 Subject: [PATCH 11/46] remove single match mode, scan_csv not compatible with bytesIO (zstd) --- pgscatalog_utils/match/match_variants.py | 31 ++++-------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 336d781..dd4ec4e 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -6,9 +6,9 @@ import polars as pl from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches -from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.read import read_target, read_scorefile from pgscatalog_utils.match.write import write_out, write_log @@ -28,18 +28,12 @@ def match_variants(): n_target_files = len(glob(args.target)) matches: pl.DataFrame - if n_target_files == 1 and not args.fast: - match_mode: str = 'single' - elif n_target_files > 1 and not args.fast: + if n_target_files > 1 and not args.fast: match_mode: str = 'multi' - elif args.fast: + else: match_mode: str = 'fast' match match_mode: - case "single": - logger.debug(f"Match mode: {match_mode}") - matches = _match_single_target(args.target, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match) case "multi": logger.debug(f"Match mode: {match_mode}") matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.skip_flip, @@ -81,8 +75,7 @@ def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs 
without iterating through chromosomes - target: pl.DataFrame = read_target(path=target_path, - remove_multiallelic=remove_multiallelic) + target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) logger.debug("Split target chromosomes not checked with fast match mode") return get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match) @@ -92,26 +85,12 @@ def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_mu matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, - remove_multiallelic=remove_multiallelic) + target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) _check_target_chroms(target) matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) return pl.concat(matches) -def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - matches = [] - for chrom in scorefile['chr_name'].unique().to_list(): - target = read_target(target_path, remove_multiallelic=remove_multiallelic, - single_file=True, chrom=chrom) # scans and filters - if target: - logger.debug(f"Matching chromosome {chrom}") - matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) - - return pl.concat(matches) - - def _description_text() -> str: return textwrap.dedent('''\ Match variants from a combined scoring file against a set of From b6aa2b0899817c518056480f50677e7d9b477a38 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 15:36:04 +0100 Subject: [PATCH 12/46] compress matched scorefiles --- pgscatalog_utils/match/write.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 53eb15f..52253a3 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -37,9 +37,11 @@ def _write_scorefile(effect_type: str, scorefiles: pl.DataFrame, split: bool, ou for k, v in df_dict.items(): chr = k.replace("false", "ALL") - path: str = os.path.join(outdir, f"{dataset}_{chr}_{effect_type}_{i}.scorefile") + path: str = os.path.join(outdir, f"{dataset}_{chr}_{effect_type}_{i}.scorefile.gz") logger.debug(f"Writing matched scorefile to {path}") - v.write_csv(path, sep="\t") + + with gzip.open(path, 'wb') as f: + v.write_csv(f, sep="\t") def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: From 92bd91ef515f023d06a61f01a6853ce280b0ab85 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 21 Sep 2022 16:35:45 +0100 Subject: [PATCH 13/46] skeleton aggregation --- pgscatalog_utils/aggregate/__init__.py | 0 pgscatalog_utils/score.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 pgscatalog_utils/aggregate/__init__.py create mode 100644 pgscatalog_utils/score.py diff --git a/pgscatalog_utils/aggregate/__init__.py b/pgscatalog_utils/aggregate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog_utils/score.py b/pgscatalog_utils/score.py new file mode 100644 index 0000000..e0e8305 --- /dev/null +++ b/pgscatalog_utils/score.py @@ -0,0 +1,12 @@ +import zstandard +from dataclasses import dataclass +import io +import logging +import polars as pl + +logger
= logging.getLogger(__name__) + + +@dataclass +class Score: + """ A class that represents calculated scores (.sscore)""" From da0105e19f3407ef910523b6a57cfeb49ce645ab Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 22 Sep 2022 11:23:28 +0100 Subject: [PATCH 14/46] add aggregate_score --- .../aggregate/aggregate_scores.py | 92 +++++++++++++++++++ pgscatalog_utils/score.py | 12 --- pyproject.toml | 1 + 3 files changed, 93 insertions(+), 12 deletions(-) create mode 100644 pgscatalog_utils/aggregate/aggregate_scores.py delete mode 100644 pgscatalog_utils/score.py diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py new file mode 100644 index 0000000..2787680 --- /dev/null +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -0,0 +1,92 @@ +import argparse +import textwrap + +import pandas as pd + +from pgscatalog_utils.log_config import set_logging_level +import glob +import logging + +logger = logging.getLogger(__name__) + + +def aggregate_scores(): + args = _parse_args() + set_logging_level(args.verbose) + df = aggregate(glob.glob(args.scores)) + logger.debug("Compressing and writing combined scores") + df.to_csv('aggregated_scores.txt.gz', sep='\t', compression='gzip') + + +def aggregate(scorefiles: list[str]): + combined = pd.DataFrame() + aggcols = set() + + for i, path in enumerate(scorefiles): + logger.debug(f"Reading {path}") + # pandas can automatically detect zst compression, neat! + df = (pd.read_table(path) + .assign(sampleset=path.split('_')[0]) + .set_index(['sampleset', '#IID'])) + + df.index.names = ['sampleset', 'IID'] + + # Subset to aggregatable columns + df = df[_select_agg_cols(df.columns)] + aggcols.update(set(df.columns)) + + # Combine DFs + if i == 0: + logger.debug('Initialising combined DF') + combined = df.copy() + else: + logger.debug('Adding to combined DF') + combined = combined.add(df, fill_value=0) + + assert all([x in combined.columns for x in aggcols]), "All Aggregatable Columns are present in the final DF" + + return combined.pipe(_calculate_average) + + +def _calculate_average(combined: pd.DataFrame): + logger.debug("Averaging data") + avgs = combined.loc[:, combined.columns.str.endswith('_SUM')].divide(combined['DENOM'], axis=0) + avgs.columns = avgs.columns.str.replace('_SUM', '_AVG') + return pd.concat([combined, avgs], axis=1) + + +def _select_agg_cols(cols): + keep_cols = ['DENOM'] + return [x for x in cols if (x.endswith('_SUM') and (x != 'NAMED_ALLELE_DOSAGE_SUM')) or (x in keep_cols)] + + +def _description_text() -> str: + return textwrap.dedent(''' + Aggregate plink .sscore files into a combined TSV table. + + This aggregation sums scores that were calculated from plink + .scorefiles. Scorefiles may be split to calculate scores over different + chromosomes or effect types. The PGS Catalog calculator automatically splits + scorefiles where appropriate, and uses this script to combine them. + + Input .sscore files can be optionally compressed with zstd or gzip. + + The aggregated output scores are compressed with gzip. + ''') + + +def _parse_args(args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=_description_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-s', '--scores', dest='scores', required=True, + help=' Path to scorefiles. 
Use a wildcard (*) to select multiple files.') + parser.add_argument('-o', '--outdir', dest='outdir', required=True, + default='scores/', help=' Output directory to store downloaded files') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help=' Extra logging information') + return parser.parse_args(args) + + +if __name__ == "__main__": + aggregate_scores() + diff --git a/pgscatalog_utils/score.py b/pgscatalog_utils/score.py deleted file mode 100644 index e0e8305..0000000 --- a/pgscatalog_utils/score.py +++ /dev/null @@ -1,12 +0,0 @@ -import zstandard -from dataclasses import dataclass -import io -import logging -import polars as pl - -logger = logging.getLogger(__name__) - - -@dataclass -class Score: - """ A class that represents calculated scores (.sscore)""" diff --git a/pyproject.toml b/pyproject.toml index 23caf20..9c7bbb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ readme = "README.md" combine_scorefiles = "pgscatalog_utils.scorefile.combine_scorefiles:combine_scorefiles" download_scorefiles = "pgscatalog_utils.download.download_scorefile:download_scorefile" match_variants = "pgscatalog_utils.match.match_variants:match_variants" +aggregate_scores = "pgscatalog_utils.aggregate.aggregate_scores:aggregate_scores" [tool.poetry.dependencies] python = "^3.10" From 85715dc93ff1fe5b6c0d3202bb178cba32694fbe Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 14:32:22 +0100 Subject: [PATCH 15/46] Cleanup the code, ignore some user warnings from pandas_schema and updatepoetry files --- .../validate/formatted/validator.py | 34 ++-- .../validate/harmonized_position/validator.py | 2 +- pgscatalog_utils/validate/schemas.py | 1 - .../validate/validate_scorefile.py | 177 ++++++++---------- pgscatalog_utils/validate/validator_base.py | 26 ++- poetry.lock | 13 ++ pyproject.toml | 3 + 7 files changed, 127 insertions(+), 129 deletions(-) diff --git a/pgscatalog_utils/validate/formatted/validator.py b/pgscatalog_utils/validate/formatted/validator.py index eda02cc..1e42336 100644 --- a/pgscatalog_utils/validate/formatted/validator.py +++ b/pgscatalog_utils/validate/formatted/validator.py @@ -3,8 +3,6 @@ from pandas_schema import Schema from pgscatalog_utils.validate.schemas import * from pgscatalog_utils.validate.validator_base import * -# from schemas import * -# from validator_base import * ''' PGS Catalog Harmonized file validator @@ -17,7 +15,7 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): super().__init__(file, score_dir, logfile, error_limit) self.score_dir=None self.meta_format = FORMATTED_META_GENERIC - self.validators = FORMATTED_VALIDATORS + self.schema_validators = FORMATTED_VALIDATORS self.valid_cols = VALID_COLS_FORMATTED self.valid_type = VALID_TYPE_FORMATTED self.setup_field_validation() @@ -91,32 +89,28 @@ def validate_data(self) -> bool: self.get_and_check_variants_number() for chunk in self.df_iterator(self.file): - to_validate = chunk[self.cols_to_read] - to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded + dataframe_to_validate = chunk[self.cols_to_read] + dataframe_to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded # Detect duplicated rows - self.detect_duplicated_rows(to_validate) + self.detect_duplicated_rows(dataframe_to_validate) + # validate the snp column if present if SNP_DSET in self.header: + sub_schema = FORMATTED_VALIDATORS_SNP if CHR_DSET and BP_DSET in self.header: - self.schema = 
Schema([FORMATTED_VALIDATORS_SNP_EMPTY[h] for h in self.cols_to_validate]) - else: - self.schema = Schema([FORMATTED_VALIDATORS_SNP[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + sub_schema = FORMATTED_VALIDATORS_SNP_EMPTY + self.validate_schema(sub_schema,dataframe_to_validate) if CHR_DSET and BP_DSET in self.header: - self.schema = Schema([FORMATTED_VALIDATORS_POS[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(FORMATTED_VALIDATORS_POS, dataframe_to_validate) + if OR_DSET in self.header: - self.schema = Schema([FORMATTED_VALIDATORS_OR[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(FORMATTED_VALIDATORS_OR,dataframe_to_validate) + if HR_DSET in self.header: - self.schema = Schema([FORMATTED_VALIDATORS_HR[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(FORMATTED_VALIDATORS_HR,dataframe_to_validate) + self.process_errors() if len(self.bad_rows) >= self.error_limit: break diff --git a/pgscatalog_utils/validate/harmonized_position/validator.py b/pgscatalog_utils/validate/harmonized_position/validator.py index b46e8c4..87b9346 100644 --- a/pgscatalog_utils/validate/harmonized_position/validator.py +++ b/pgscatalog_utils/validate/harmonized_position/validator.py @@ -13,7 +13,7 @@ class ValidatorPos(ValidatorBase): def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): super().__init__(file, score_dir, logfile, error_limit) self.meta_format = HM_META_POS - self.validators = POS_VALIDATORS + self.schema_validators = POS_VALIDATORS self.valid_cols = VALID_COLS_POS self.valid_type = VALID_TYPE_POS self.setup_field_validation() diff --git a/pgscatalog_utils/validate/schemas.py b/pgscatalog_utils/validate/schemas.py index 7487b21..43e8e27 100644 --- a/pgscatalog_utils/validate/schemas.py +++ b/pgscatalog_utils/validate/schemas.py @@ -1,4 +1,3 @@ -import sys import numpy as np from pandas_schema import Column from pandas_schema.validation import MatchesPatternValidation, InListValidation, CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation, CustomElementValidation diff --git a/pgscatalog_utils/validate/validate_scorefile.py b/pgscatalog_utils/validate/validate_scorefile.py index 3e38bf4..80294c3 100644 --- a/pgscatalog_utils/validate/validate_scorefile.py +++ b/pgscatalog_utils/validate/validate_scorefile.py @@ -1,6 +1,7 @@ import os, glob, re import argparse import logging +import textwrap data_sum = {'valid': [], 'invalid': [], 'other': []} @@ -8,6 +9,55 @@ logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + +def validate_scorefile() -> None: + global data_sum, score_dir + args = _parse_args() + _check_args(args) + + # Check PGS Catalog file name nomenclature + check_filename = False + if args.check_filename: + check_filename = True + else: + print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.") + + validator_type = args.t + files_dir = args.dir + log_dir = args.log_dir + + ## Select validator class ## + if validator_type == 'formatted': + import pgscatalog_utils.validate.formatted.validator as validator_package + elif validator_type == 'hm_pos': + import 
pgscatalog_utils.validate.harmonized_position.validator as validator_package + + ## Run validator ## + # One file + if args.f: + _run_validator(args.f,log_dir,score_dir,validator_package,check_filename,validator_type) + # Content of the directory + elif files_dir: + count_files = 0 + # Browse directory: for each file run validator + for filepath in sorted(glob.glob(files_dir+"/*.*")): + _run_validator(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) + count_files += 1 + + # Print summary + results + print("\nSummary:") + if data_sum['valid']: + print(f"- Valid: {len(data_sum['valid'])}/{count_files}") + if data_sum['invalid']: + print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") + if data_sum['other']: + print(f"- Other issues: {len(data_sum['other'])}/{count_files}") + + if data_sum['invalid']: + print("Invalid files:") + print("\n".join(data_sum['invalid'])) + + def _read_last_line(file: str) -> str: ''' Return the last line of the file @@ -36,47 +86,7 @@ def _file_validation_state(filename: str, log_file: str) -> None: data_sum['other'].append(filename) -def _run_validator(validator: object, file: str, check_filename: bool, logfile: str, validator_type: str) -> None: - ''' Main method to run the PGS file validator ''' - if check_filename: - validator.run_validator() - else: - validator.run_validator_skip_check_filename() - # validator.logger.propagate = False - - # # Check files exist - # if not file or not logfile: - # validator.logger.info("Missing file and/or logfile") - # validator.set_file_is_invalid() - # elif file and not os.path.exists(file): - # validator.logger.info("Error: the file '"+file+"' can't be found") - # validator.set_file_is_invalid() - - # # Validate file extension - # validator.validate_file_extension() - - # # Validate file name nomenclature - # if validator.is_file_valid() and check_filename: - # validator.validate_filename() - - # # Only for harmonized files - # if validator.is_file_valid() and validator_type != 'formatted': - # validator.compare_with_filename() - - # # Validate column headers - # if validator.is_file_valid(): - # validator.validate_headers() - - # # Validate data content - # if validator.is_file_valid(): - # validator.validate_data() - - # # Close log handler - # validator.logger.removeHandler(validator.handler) - # validator.handler.close() - - -def _check_args(args): +def _check_args(args: argparse.Namespace) -> None: global score_dir ## Check parameters ## @@ -112,79 +122,50 @@ def _check_args(args): print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.") -def validate_file(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None: +def _run_validator(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None: ''' Run the file validator ''' file = os.path.basename(filepath) filename = file.split('.')[0] print(f"# Filename: {file}") - log_file = log_dir+'/'+filename+'_log.txt' + log_file = f'{log_dir}/{filename}_log.txt' # Run validator validator = validator_package.init_validator(filepath,log_file,score_dir) - _run_validator(validator,filepath,check_filename,log_file,validator_type) + if check_filename: + validator.run_validator() + else: + validator.run_validator_skip_check_filename() # Check log 
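    # _file_validation_state() inspects the log written by the validator and files the
    # result under data_sum['valid'], data_sum['invalid'] or data_sum['other'] for the summary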
_file_validation_state(file,log_file) -def main(): - global data_sum, score_dir +def _description_text() -> str: + return textwrap.dedent('''\ + Validate a set of scoring files to match the PGS Catalog scoring file formats. + It can validate: + - The formatted scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring) + - The harmonized (Position) scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos) + ''') - argparser = argparse.ArgumentParser() - argparser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE') - argparser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME') - argparser.add_argument('--dir', help='The name of the directory containing the files that need to processed (no need to use the [-f] option') - argparser.add_argument('--score_dir', help=' The name of the directory containing the formatted scoring files to compare with harmonized scoring files') - argparser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True) - argparser.add_argument('--check_filename', help=' Check that the file name match the PGS Catalog nomenclature', required=False, action='store_true') - - args = argparser.parse_args() - - ## Check parameters ## - _check_args(args) - # Check PGS Catalog file name nomenclature - check_filename = False - if args.check_filename: - check_filename = True - else: - print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.") +def _epilog_text() -> str: + return textwrap.dedent(f'''\ + You need to specify the type of file format to validate, using the paramter '-t' ({' or '.join(val_types)}). 
+ ''') - validator_type = args.t - files_dir = args.dir - log_dir = args.log_dir +def _parse_args(args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE') + parser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME') + parser.add_argument('--dir', help='The name of the directory containing the files that need to processed (no need to use the [-f] option') + parser.add_argument('--score_dir', help=' The name of the directory containing the formatted scoring files to compare with harmonized scoring files') + parser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True) + parser.add_argument('--check_filename', help=' Check that the file name match the PGS Catalog nomenclature', required=False, action='store_true') + return parser.parse_args(args) - ## Select validator class ## - if validator_type == 'formatted': - import pgscatalog_utils.validate.formatted.validator as validator_package - elif validator_type == 'hm_pos': - import pgscatalog_utils.validate.harmonized_position.validator as validator_package - - ## Run validator ## - # One file - if args.f: - validate_file(args.f,log_dir,score_dir,validator_package,check_filename,validator_type) - # Content of the directory - elif files_dir: - count_files = 0 - # Browse directory: for each file run validator - for filepath in sorted(glob.glob(files_dir+"/*.*")): - validate_file(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) - count_files += 1 - - # Print summary + results - print("\nSummary:") - if data_sum['valid']: - print(f"- Valid: {len(data_sum['valid'])}/{count_files}") - if data_sum['invalid']: - print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") - if data_sum['other']: - print(f"- Other issues: {len(data_sum['other'])}/{count_files}") - - if data_sum['invalid']: - print("Invalid files:") - print("\n".join(data_sum['invalid'])) if __name__ == '__main__': - main() + validate_scorefile() diff --git a/pgscatalog_utils/validate/validator_base.py b/pgscatalog_utils/validate/validator_base.py index 80af5c4..ddfbc59 100644 --- a/pgscatalog_utils/validate/validator_base.py +++ b/pgscatalog_utils/validate/validator_base.py @@ -7,10 +7,8 @@ from typing import List import pandas as pd import pandas_schema -from pgscatalog_utils.validate.schemas import * import warnings - -warnings.filterwarnings('ignore', category=UserWarning, module='pandas_schema') +from pgscatalog_utils.validate.schemas import * ''' PGS Catalog file validator @@ -23,7 +21,7 @@ class ValidatorBase: valid_extensions = VALID_FILE_EXTENSIONS - validators = GENERIC_VALIDATORS + schema_validators = GENERIC_VALIDATORS valid_cols = [] valid_type = '' sep = '\t' @@ -55,6 +53,18 @@ def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0): self.variants_number = 0 + def validate_schema(self, schema: dict, dataframe_to_validate: pd.core.frame.DataFrame): + ''' + Run the pandas_schema validation using the provided Schema and DataFrame + ''' + self.schema = pandas_schema.Schema([schema[h] for h in self.cols_to_validate]) + with warnings.catch_warnings(): + # Ignore python warningd raised in the pandas_schema code + warnings.simplefilter('ignore', 
UserWarning) + errors = self.schema.validate(dataframe_to_validate) + self.store_errors(errors) + + def setup_field_validation(self): ''' Fetch the header and build the list of column to check/validate @@ -146,13 +156,11 @@ def validate_data(self) -> bool: # Validate data content and check the consitence between the declared variants number and the actual number of variants in the file self.validate_content() for chunk in self.df_iterator(self.file): - to_validate = chunk[self.cols_to_read] - to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded + dataframe_to_validate = chunk[self.cols_to_read] + dataframe_to_validate.columns = self.cols_to_validate # sets the headers to standard format if neeeded # Schema validation - self.schema = pandas_schema.Schema([self.validators[h] for h in self.cols_to_validate]) - errors = self.schema.validate(to_validate) - self.store_errors(errors) + self.validate_schema(self.schema_validators,dataframe_to_validate) self.process_errors() if len(self.bad_rows) >= self.error_limit: diff --git a/poetry.lock b/poetry.lock index e920a73..c3a2742 100644 --- a/poetry.lock +++ b/poetry.lock @@ -120,6 +120,19 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "pandas-schema" +version = "0.3.6" +description = "A validation library for Pandas data frames using user-friendly schemas" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +numpy = "*" +packaging = "*" +pandas = ">=0.19" + [[package]] name = "pluggy" version = "1.0.0" diff --git a/pyproject.toml b/pyproject.toml index 44ef233..e23d84b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,10 +11,13 @@ readme = "README.md" combine_scorefiles = "pgscatalog_utils.scorefile.combine_scorefiles:combine_scorefiles" download_scorefiles = "pgscatalog_utils.download.download_scorefile:download_scorefile" match_variants = "pgscatalog_utils.match.match_variants:match_variants" +validate_scorefiles = "pgscatalog_utils.validate.validate_scorefile:validate_scorefile" [tool.poetry.dependencies] python = "^3.10" +numpy = "^1.23.3" pandas = "^1.4.3" +pandas-schema = "^0.3.6" pyliftover = "^0.4" requests = "^2.28.1" jq = "^1.2.2" From 60e3d9f174bb0b501037d0da688607a067cb83eb Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 14:43:00 +0100 Subject: [PATCH 16/46] Attempt to fix poetry error --- poetry.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/poetry.lock b/poetry.lock index c3a2742..7f2a58e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -344,6 +344,7 @@ pandas = [ {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, ] +pandas-schema = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, From f6d727f94e90d8f17bac3d9e311a5cea5d7f42de Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 14:52:35 +0100 Subject: [PATCH 17/46] Fix version discrepancies for numpy --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 7f2a58e..0cebbc4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -87,7 +87,7 @@ 
python-versions = ">=3.5" [[package]] name = "numpy" -version = "1.23.1" +version = "1.23.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" optional = false From 13216fed7dd81109d2a24b60ec0abd8b77b33873 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Thu, 22 Sep 2022 15:02:22 +0100 Subject: [PATCH 18/46] Fix path to the 'validate' test data files --- tests/test_validate.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index e00448e..7459f05 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -6,6 +6,7 @@ log_file = 'VALIDATE.log' +test_data_dir = './tests/data' ###### Formatted scoring files ###### @@ -109,40 +110,40 @@ def test_valid_formatted_file_pos_only_38(test_hmpos_file_GRCh38_3): @pytest.fixture def test_file_1(): - return './data/test_scoring_file_1.txt.gz' + return f'{test_data_dir}/test_scoring_file_1.txt.gz' @pytest.fixture def test_file_2(): - return './data/test_scoring_file_2.txt.gz' + return f'{test_data_dir}/test_scoring_file_2.txt.gz' @pytest.fixture def test_file_3(): - return './data/test_scoring_file_3.txt.gz' + return f'{test_data_dir}/test_scoring_file_3.txt.gz' @pytest.fixture def test_file_4(): - return './data/test_scoring_file_4.txt.gz' + return f'{test_data_dir}/test_scoring_file_4.txt.gz' @pytest.fixture def test_hmpos_file_GRCh37_1(): - return './data/test_scoring_file_hmpos_37_1.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_37_1.txt.gz' @pytest.fixture def test_hmpos_file_GRCh38_1(): - return './data/test_scoring_file_hmpos_38_1.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_38_1.txt.gz' @pytest.fixture def test_hmpos_file_GRCh37_2(): - return './data/test_scoring_file_hmpos_37_2.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_37_2.txt.gz' @pytest.fixture def test_hmpos_file_GRCh38_2(): - return './data/test_scoring_file_hmpos_38_2.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_38_2.txt.gz' @pytest.fixture def test_hmpos_file_GRCh37_3(): - return './data/test_scoring_file_hmpos_37_3.txt.gz' + return f'{test_data_dir}/test_scoring_file_hmpos_37_3.txt.gz' @pytest.fixture def test_hmpos_file_GRCh38_3(): - return './data/test_scoring_file_hmpos_38_3.txt.gz' \ No newline at end of file + return f'{test_data_dir}/test_scoring_file_hmpos_38_3.txt.gz' \ No newline at end of file From b60b0aa32b9a9ac459ca21f3e5153afbc7a85048 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 23 Sep 2022 09:54:28 +0100 Subject: [PATCH 19/46] Update the README file --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d19c186..e0126ec 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ other users might find some of these tools helpful. in 'long' format * `match_variants`: Match target variants (bim or pvar files) against the output of `combine_scorefile` to produce scoring files for plink 2 +* `validate_scorefiles`: Check/validate that the scoring files and harmonized scoring files match the PGS Catalog scoring file formats. ## Installation @@ -26,6 +27,7 @@ $ pip install pgscatalog-utils $ download_scorefiles -i PGS000922 PGS001229 -o . -b GRCh37 $ combine_scorefiles -s PGS*.txt.gz -o combined.txt $ match_variants -s combined.txt -t --min_overlap 0.75 --outdir . +$ validate_scorefiles -t formatted --dir --log_dir ``` More details are available using the `--help` parameter. 
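The `validate_scorefiles` console script maps to `pgscatalog_utils.validate.validate_scorefile:validate_scorefile`, which reads its options from `sys.argv` via argparse, so it can also be driven from Python, e.g. in a quick smoke test. A minimal sketch, assuming placeholder `scoring_files/` and `logs/` directories that already exist:

```
import sys

from pgscatalog_utils.validate.validate_scorefile import validate_scorefile

# validate_scorefile() parses sys.argv itself, so patch argv before calling it
sys.argv = ["validate_scorefiles",
            "-t", "formatted",          # or "hm_pos" for harmonized (position) files
            "--dir", "scoring_files/",  # placeholder: directory of scoring files to check
            "--log_dir", "logs/"]       # placeholder: where the per-file *_log.txt files go
validate_scorefile()
```

As on the command line, `--check_filename` and `--score_dir` are optional; omitting them only skips the file name nomenclature check and the formatted-vs-harmonized row-count comparison.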
From 1881c17ddf978230d9eb1b84a33dc271c2da11e4 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 11:30:31 +0100 Subject: [PATCH 20/46] fix df truthiness --- pgscatalog_utils/match/match.py | 8 ++++---- pgscatalog_utils/match/preprocess.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 677f22a..e0347b2 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -18,7 +18,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] - if scorefile_oa: + if not scorefile_oa.is_empty(): logger.debug("Getting matches for scores with effect allele and other allele") matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt").select(col_order)) matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref").select(col_order)) @@ -26,7 +26,7 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip").select(col_order)) matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip").select(col_order)) - if scorefile_no_oa: + if not scorefile_no_oa.is_empty(): logger.debug("Getting matches for scores with effect allele only") matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt").select(col_order)) @@ -92,7 +92,7 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: """ Casting important columns to categorical makes polars fast """ - if scorefile: + if not scorefile.is_empty(): scorefile = scorefile.with_columns([ pl.col("effect_allele").cast(pl.Categorical), pl.col("other_allele").cast(pl.Categorical), @@ -101,7 +101,7 @@ def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: pl.col("other_allele_FLIP").cast(pl.Categorical), pl.col("accession").cast(pl.Categorical) ]) - if target: + if not target.is_empty(): target = target.with_columns([ pl.col("ID").cast(pl.Categorical), pl.col("REF").cast(pl.Categorical), diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 4d93090..206466f 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -35,14 +35,14 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if df.select('is_multiallelic').sum() > 0: + if df.get_column('is_multiallelic').sum() > 0: logger.debug("Multiallelic variants detected") if remove_multiallelic: if file_format == "bim": logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " "variant representations only") logger.debug('Dropping multiallelic variants') - return df.filter(~df['is_multiallelic']) + return df.filter(~df.get_column('is_multiallelic')) else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants From 3bf62c398071e7737514fbedfd459d475259600e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 
11:32:39 +0100 Subject: [PATCH 21/46] fix bumped version --- Dockerfile | 4 ++-- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8c19690..0d42228 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ FROM python:3.10 WORKDIR /opt/ -COPY --from=builder /app/dist/pgscatalog_utils-0.1.2-py3-none-any.whl . +COPY --from=builder /app/dist/pgscatalog_utils-0.2.0-py3-none-any.whl . -RUN pip install pgscatalog_utils-0.1.2-py3-none-any.whl +RUN pip install pgscatalog_utils-0.2.0-py3-none-any.whl RUN apt-get update && apt-get install -y sqlite3 \ No newline at end of file diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index 8ce9b36..7fd229a 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.1.3' +__version__ = '0.2.0' diff --git a/pyproject.toml b/pyproject.toml index 9c7bbb5..e362cb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.1.3" +version = "0.2.0" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert "] From f02c58c6b9af287521f68638bb82f4df5590d6c5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 11:36:27 +0100 Subject: [PATCH 22/46] batch process input to reduce memory usage --- pgscatalog_utils/target.py | 119 ++++++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 14 deletions(-) diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 6b28998..22204f8 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -33,22 +33,113 @@ def from_path(cls, path): return cls(file_format=file_format, path=path, header=header, compressed=compressed) - def read(self) -> pl.DataFrame: - """ Read variant information into a polars df (expensive operation). Automatically handle compressed data. 
""" - # column_1 is always CHROM, which must always be a string or X/Y/MT/PAR will break inferred dtypes - logger.debug("Reading target into memory") - chrom_dtype = {'column_1': str} + # @profile + def read(self): + # this function is responsible for returning dfs allocated to contiguous memory, so manually rechunk if self.compressed: - with open(self.path, 'rb') as f: - dctx = zstandard.ZstdDecompressor() - with dctx.stream_reader(f) as reader: - df: pl.DataFrame = pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) - df.columns = self.header - return df.select(_default_cols()) + return self._read_compressed_chunks().rechunk() else: - df: pl.DataFrame = pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', dtype=chrom_dtype) - df.columns = self.header - return df.select(_default_cols()) + batch_size = 10000000 + n_rows_read = 0 + df_lst = [] + while True: + df_lst.append(self._read_batch(batch_size=batch_size, n_skip=n_rows_read)) + n_rows_read = n_rows_read + batch_size + + if df_lst[-1].shape[0] < batch_size: + logger.debug("Finished reading final batch") + break + + return pl.concat(df_lst, rechunk=True) + + def _read_batch(self, batch_size, n_skip): + logger.debug(f"{n_skip} target variants read, reading next batch") + assert not self.compressed + # TODO: lazy frame it + logger.debug("Reading uncompressed data") + return pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', n_threads=1, + dtype=_get_col_dtypes(self.file_format), + columns=_get_default_col_idx(self.file_format), + new_columns=_default_cols(), + rechunk=False, + n_rows=batch_size, + skip_rows_after_header=n_skip) + + def _read_compressed_chunks(self): + logger.debug("Reading zstd compressed data") + df_lst = [] + dtypes = _get_col_dtypes(self.file_format) + columns = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + chunk_buffer = b'' + + # don't decode bytes stream to utf-8 with TextIOWrapper in python, polars + rust will be faster + for chunk in dctx.read_to_iter(fh, read_size=int(1e+8)): # read 100MB of compressed data per chunk + if not chunk: + break + + end = chunk.rfind(b'\n') + 1 # only want to read complete rows + if chunk_buffer: + row_chunk = b''.join([chunk_buffer, chunk[:end]]) + chunk_buffer = b'' + else: + row_chunk = chunk[:end] + + df = pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', n_threads=1, + dtype=dtypes, + columns=columns, + new_columns=new_col_names, + rechunk=False) + df_lst.append(df) + chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + + return pl.concat(df_lst, rechunk=False) + + +def _get_default_col_idx(file_format): + # import default columns: + # ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] + match file_format: + case 'bim': + return [0, 1, 3, 4, 5] # see _get_col_dtypes, dropping centimorgans + case 'pvar': + return [0, 1, 2, 3, 4] # dropping QUAL FILTER INFO etc + case _: + logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") + raise Exception + + +def _get_col_dtypes(file_format): + """ Manually set up categorical dtypes """ + match file_format: + case 'bim': + # 1. Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name + # 2. Variant identifier + # 3. Position in morgans or centimorgans (safe to use dummy value of '0') + # 4. Base-pair coordinate (1-based; limited to 231-2) + # 5. 
Allele 1 (corresponding to clear bits in .bed; usually minor) + # 6. Allele 2 (corresponding to set bits in .bed; usually major) + d = {'column_1': pl.Categorical, 'column_2': str, 'column_3': pl.Float64, 'column_4': pl.UInt64, + 'column_5': pl.Categorical, 'column_6': pl.Categorical} + case 'pvar': + # 1. CHROM + # 2. POS (base-pair coordinate) + # 3. ID (variant ID; required) + # 4. REF (reference allele) + # 5. ALT (alternate alleles, comma-separated) + # 6. QUAL (phred-scaled quality score for whether the locus is variable at all) + # 7. FILTER ('PASS', '.', or semicolon-separated list of failing filter codes) + # 8. INFO (semicolon-separated list of flags and key-value pairs, with types declared in header) + d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Utf8, 'column_4': pl.Categorical, + 'column_5': pl.Utf8, 'column_6': pl.Float32, 'column_7': pl.Utf8, 'column_8': pl.Utf8} + # can't cast ALT to cat yet, because of multiallelic variants! + case _: + logger.critical("Trying to set header dtypes for an invalid file format, time to explode") + raise Exception + return d def _get_header(fh) -> tuple[str, list[str]]: From ae8ce1446d9aacdea0f46fc8521ace5a92f0abf1 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 27 Sep 2022 16:41:59 +0100 Subject: [PATCH 23/46] read uncompressed data with a bufferedreader --- pgscatalog_utils/target.py | 98 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 22204f8..ee62074 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -11,7 +11,6 @@ class Target: """ Class to detect and read a plink1/plink2 variant information file """ file_format: str = None - header: list[str] = None path: str = None compressed: bool = False @@ -20,7 +19,7 @@ def from_path(cls, path): """ Create a Target object from a path. Cheaply detect file format and headers. """ try: with open(path, 'r') as f: - file_format, header = _get_header(f) + file_format = _get_format(f) compressed = False except UnicodeDecodeError: logger.error("Can't open target as a text file, so trying to read zstd compressed binary file") @@ -28,44 +27,61 @@ def from_path(cls, path): dctx = zstandard.ZstdDecompressor() stream_reader = dctx.stream_reader(f) text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8') - file_format, header = _get_header(text_stream) + file_format = _get_format(text_stream) compressed = True - return cls(file_format=file_format, path=path, header=header, compressed=compressed) + return cls(file_format=file_format, path=path, compressed=compressed) - # @profile + #@profile def read(self): - # this function is responsible for returning dfs allocated to contiguous memory, so manually rechunk if self.compressed: - return self._read_compressed_chunks().rechunk() + df = self._read_compressed_chunks().rechunk().lazy() + return _filter_target(df) else: - batch_size = 10000000 - n_rows_read = 0 - df_lst = [] + df = self._read_uncompressed_chunks().rechunk().lazy() + return _filter_target(df) + + def _read_uncompressed_chunks(self): + """ Read a CSV using a BufferedIOReader. This is a bit slower than pl.read_csv() (30s vs 5s). + + Lots of testing showed that lazy scanning and native polars reading used a lot of RAM, then freed a bunch. + Plotting RAM usage against time looked like a spiky hedgehog. + + This function linearly consumes RAM in a more linear way by: + 1. Reading a batch of lines + 2. 
Dropping unused columns + 3. Setting categorical dtypes on read + 4. Don't rechunk until later + """ + logger.debug("Reading uncompressed chunks") + + df_lst = [] + dtypes = _get_col_dtypes(self.file_format) + col_idxs = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + + with open(self.path, "rb") as f: while True: - df_lst.append(self._read_batch(batch_size=batch_size, n_skip=n_rows_read)) - n_rows_read = n_rows_read + batch_size + buffer = b''.join(f.readlines(int(1e6))) - if df_lst[-1].shape[0] < batch_size: - logger.debug("Finished reading final batch") + if not buffer: break - return pl.concat(df_lst, rechunk=True) - - def _read_batch(self, batch_size, n_skip): - logger.debug(f"{n_skip} target variants read, reading next batch") - assert not self.compressed - # TODO: lazy frame it - logger.debug("Reading uncompressed data") - return pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', n_threads=1, - dtype=_get_col_dtypes(self.file_format), - columns=_get_default_col_idx(self.file_format), - new_columns=_default_cols(), - rechunk=False, - n_rows=batch_size, - skip_rows_after_header=n_skip) + df = (pl.read_csv(buffer, sep='\t', has_header=False, comment_char='#', n_threads=1, + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + rechunk=False)) + + df_lst.append(df) + + return pl.concat(df_lst, rechunk=False) def _read_compressed_chunks(self): + """ Like _read_uncompressed_chunks, but read chunks of bytes and handle incomplete rows + + zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars + """ logger.debug("Reading zstd compressed data") df_lst = [] dtypes = _get_col_dtypes(self.file_format) @@ -76,7 +92,6 @@ def _read_compressed_chunks(self): dctx = zstandard.ZstdDecompressor() chunk_buffer = b'' - # don't decode bytes stream to utf-8 with TextIOWrapper in python, polars + rust will be faster for chunk in dctx.read_to_iter(fh, read_size=int(1e+8)): # read 100MB of compressed data per chunk if not chunk: break @@ -113,7 +128,7 @@ def _get_default_col_idx(file_format): def _get_col_dtypes(file_format): - """ Manually set up categorical dtypes """ + """ Manually set up dtypes. pl.Categorical saves a lot of RAM vs pl.Utf8 """ match file_format: case 'bim': # 1. 
Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name @@ -142,36 +157,25 @@ def _get_col_dtypes(file_format): return d -def _get_header(fh) -> tuple[str, list[str]]: - header = None +def _get_format(fh) -> str: file_format = None - logger.debug(f"Scanning header to get file format and column names") + logger.debug(f"Scanning header to get file format") for line in fh: if line.startswith('#'): logger.debug("pvar format detected") file_format = 'pvar' - header = _pvar_header(fh) break else: logger.debug("bim format detected") file_format = 'bim' - header = _bim_header() break - return file_format, header - + return file_format -def _pvar_header(fh) -> list[str]: - """ Get the column names from the pvar file (not constrained like bim, especially when converted from VCF) """ - line: str = '#' - while line.startswith('#'): - line: str = fh.readline() - if line.startswith('#CHROM'): - return line.strip().split('\t') - -def _bim_header() -> list[str]: - return ['#CHROM', 'ID', 'CM', 'POS', 'REF', 'ALT'] +def _default_cols() -> list[str]: + """ Standardise column names in a target genome """ + return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] def _default_cols() -> list[str]: From 3bb3e3d89a05302d58a83bdee76bc02dec04f4a5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 28 Sep 2022 17:15:58 +0100 Subject: [PATCH 24/46] add lazy evaluation --- pgscatalog_utils/match/filter.py | 16 +++---- pgscatalog_utils/match/label.py | 2 +- pgscatalog_utils/match/log.py | 24 +++++------ pgscatalog_utils/match/match.py | 55 ++++++++---------------- pgscatalog_utils/match/match_variants.py | 4 +- pgscatalog_utils/match/preprocess.py | 27 +++++++++--- pgscatalog_utils/match/read.py | 31 +++++++++---- pgscatalog_utils/match/write.py | 8 ++-- pgscatalog_utils/target.py | 21 +++++---- 9 files changed, 97 insertions(+), 91 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index c47a449..c2d0364 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -5,14 +5,14 @@ logger = logging.getLogger(__name__) -def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: float, - dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: +def filter_scores(scorefile: pl.LazyFrame, matches: pl.LazyFrame, min_overlap: float, + dataset: str) -> tuple[pl.LazyFrame, pl.LazyFrame]: """ Check overlap between filtered matches and scorefile, remove scores that don't match well and report stats """ - filtered_matches: pl.DataFrame = _filter_matches(matches) - match_log: pl.DataFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) + filtered_matches: pl.LazyFrame = _filter_matches(matches) + match_log: pl.LazyFrame = (_join_filtered_matches(filtered_matches, scorefile, dataset) .with_columns(pl.col('best_match').fill_null(False))) - fail_rates: pl.DataFrame = _calculate_match_rate(match_log) + fail_rates: pl.DataFrame = _calculate_match_rate(match_log).collect() # collect for iteration scores: list[pl.DataFrame] = [] for accession, rate in zip(fail_rates['accession'].to_list(), fail_rates['fail_rate'].to_list()): @@ -25,7 +25,7 @@ def filter_scores(scorefile: pl.DataFrame, matches: pl.DataFrame, min_overlap: f logger.error(f"Score {accession} fails minimum matching threshold ({1 - rate:.2%} variants match)") scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) - score_summary: pl.DataFrame = pl.concat(scores) + score_summary: pl.LazyFrame = pl.concat(scores).lazy() 
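    # fail_rates was collected eagerly above so the accession loop can iterate over real rows;
    # wrapping score_summary back into a LazyFrame keeps the join with filtered_matches lazy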
filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left') .filter(pl.col('score_pass') == True)) @@ -39,12 +39,12 @@ def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: .with_column((pl.col('no_match') / pl.col('count')).alias('fail_rate'))) -def _filter_matches(df: pl.DataFrame) -> pl.DataFrame: +def _filter_matches(df: pl.LazyFrame) -> pl.LazyFrame: logger.debug("Filtering variants with exclude flag") return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False)) -def _join_filtered_matches(matches: pl.DataFrame, scorefile: pl.DataFrame, dataset: str) -> pl.DataFrame: +def _join_filtered_matches(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str) -> pl.LazyFrame: return (scorefile.join(matches, on=['row_nr', 'accession'], how='left') .with_column(pl.lit(dataset).alias('dataset')) .select(pl.exclude("^.*_right$"))) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 0d38ccb..ad7423c 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -60,7 +60,7 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: .then(pl.lit(True)) .otherwise(pl.lit(False)) .alias('best_match'))) - assert prioritised.shape[0] == df.shape[0] # I'm watching you, Wazowski. Always watching. Always. + return prioritised.drop(['match_priority', 'best_match_type']) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 91f3999..3b4686c 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -def make_logs(scorefile, match_candidates, filter_summary, dataset): +def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, filter_summary: pl.LazyFrame, dataset: str): # summary log -> aggregated from best matches (one per scoring file line) # big log -> unaggregated, written to compressed gzip, possibly multiple matches per scoring file line summary_log, big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, @@ -13,29 +13,29 @@ def make_logs(scorefile, match_candidates, filter_summary, dataset): dataset=dataset) # make sure the aggregated best log matches the scoring file accession line count - summary_count = (summary_log.groupby(pl.col('accession')) + summary_count: pl.LazyFrame = (summary_log.groupby(pl.col('accession')) .agg(pl.sum('count'))) - log_count = (scorefile.groupby("accession") - .count() - .join(summary_count, on='accession')) + log_count: pl.DataFrame = (scorefile.groupby("accession") + .agg(pl.count()) + .join(summary_count, on='accession')).collect() - assert (log_count['count'] == log_count['count_right']).all(), "Log doesn't match input scoring file" + assert (log_count.get_column('count') == log_count.get_column('count_right')).all(), "Log doesn't match input scoring file" logger.debug("Log matches input scoring file") return _prettify_log(big_log), _prettify_summary(summary_log) -def make_summary_log(best_matches, filter_summary): +def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) -> pl.LazyFrame: """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', 'duplicate_ID']) - .count() + .agg(pl.count()) .join(filter_summary, how='left', on='accession')) -def _prettify_summary(df: pl.DataFrame): +def _prettify_summary(df: pl.LazyFrame) -> 
pl.LazyFrame: keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate_best_match", "duplicate_ID", "count", "percent"] return (df.with_column((pl.col("count") / pl.sum("count") * 100) @@ -44,7 +44,7 @@ def _prettify_summary(df: pl.DataFrame): .select(keep_cols)) -def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: +def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] @@ -54,8 +54,8 @@ def _prettify_log(df: pl.DataFrame) -> pl.DataFrame: return pretty_df -def _join_match_candidates(scorefile: pl.DataFrame, matches: pl.DataFrame, filter_summary: pl.DataFrame, - dataset: str) -> tuple[pl.DataFrame, pl.DataFrame]: +def _join_match_candidates(scorefile: pl.LazyFrame, matches: pl.LazyFrame, filter_summary: pl.LazyFrame, + dataset: str) -> tuple[pl.LazyFrame, pl.LazyFrame]: """ Join match candidates against the original scoring file """ logger.debug("Making big logs") diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index e0347b2..aedf941 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -9,32 +9,31 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: - scorefile_cat, target_cat = _cast_categorical(scorefile, target) - scorefile_oa = scorefile_cat.filter(pl.col("other_allele") != None) - scorefile_no_oa = scorefile_cat.filter(pl.col("other_allele") == None) + scorefile_oa = scorefile.filter(pl.col("other_allele") != None) + scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) matches: list[pl.DataFrame] = [] col_order = ['row_nr', 'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] - if not scorefile_oa.is_empty(): - logger.debug("Getting matches for scores with effect allele and other allele") - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt").select(col_order)) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_cat, target_cat, match_type="refalt_flip").select(col_order)) - matches.append(_match_variants(scorefile_cat, target_cat, match_type="altref_flip").select(col_order)) + logger.debug("Getting matches for scores with effect allele and other allele") + matches.append(_match_variants(scorefile=scorefile_oa, target=target, match_type="refalt").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="altref").select(col_order)) + if skip_flip is False: + matches.append(_match_variants(scorefile_oa, target, match_type="refalt_flip").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="altref_flip").select(col_order)) - if not scorefile_no_oa.is_empty(): - logger.debug("Getting matches for scores with effect allele only") - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref").select(col_order)) - matches.append(_match_variants(scorefile_no_oa, target_cat, 
match_type="no_oa_alt").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_ref_flip").select(col_order)) - matches.append(_match_variants(scorefile_no_oa, target_cat, match_type="no_oa_alt_flip").select(col_order)) + logger.debug("Getting matches for scores with effect allele only") + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt").select(col_order)) + if skip_flip is False: + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) - return pl.concat(matches).pipe(label_matches, remove_ambiguous, keep_first_match) + # manually collect to avoid concat error TODO: try to reproduce and file a bug report + logger.debug("Collecting all matches (parallel)") + return pl.concat(pl.collect_all(matches)).lazy().pipe(label_matches, remove_ambiguous, keep_first_match) def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: @@ -89,23 +88,3 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s pl.lit(match_type).alias("match_type")]) .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join - -def _cast_categorical(scorefile, target) -> tuple[pl.DataFrame, pl.DataFrame]: - """ Casting important columns to categorical makes polars fast """ - if not scorefile.is_empty(): - scorefile = scorefile.with_columns([ - pl.col("effect_allele").cast(pl.Categorical), - pl.col("other_allele").cast(pl.Categorical), - pl.col("effect_type").cast(pl.Categorical), - pl.col("effect_allele_FLIP").cast(pl.Categorical), - pl.col("other_allele_FLIP").cast(pl.Categorical), - pl.col("accession").cast(pl.Categorical) - ]) - if not target.is_empty(): - target = target.with_columns([ - pl.col("ID").cast(pl.Categorical), - pl.col("REF").cast(pl.Categorical), - pl.col("ALT").cast(pl.Categorical) - ]) - - return scorefile, target diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index dd4ec4e..64311ee 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -50,14 +50,14 @@ def match_variants(): valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, min_overlap=args.min_overlap) - if valid_matches.is_empty(): # this can happen if args.min_overlap = 0 + if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 logger.error("Error: no target variants match any variants in scoring files") raise Exception big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) write_log(big_log, prefix=dataset) - summary_log.write_csv(f"{dataset}_summary.csv") + summary_log.collect().write_csv(f"{dataset}_summary.csv") write_out(valid_matches, args.split, args.outdir, dataset) diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 206466f..d7b1e86 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -2,9 +2,21 @@ import polars as pl +from pgscatalog_utils.target import logger + logger = logging.getLogger(__name__) +def filter_target(df): + """ Remove variants that won't be matched against the scorefile + + Chromosomes 1 - 22, X, 
and Y with an efficient join. Remmove variants with missing identifiers also + """ + logger.debug("Filtering target to include chromosomes 1 - 22, X, Y") + chroms = [str(x) for x in list(range(1, 23)) + ['X', 'Y']] + return df.filter((pl.col('#CHROM').is_in(chroms)) & (pl.col('ID') != '.')) + + def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataFrame: """ Improved function to complement alleles. Will only complement sequences that are valid DNA. """ @@ -27,7 +39,7 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format: str) -> pl.DataFrame: +def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataFrame: # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) @@ -35,14 +47,15 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if df.get_column('is_multiallelic').sum() > 0: + multiallelic_canary = (df.filter(pl.col('is_multiallelic') == True) + .limit(1) # just detect the first occurring + .collect()) + + if not multiallelic_canary.is_empty(): logger.debug("Multiallelic variants detected") if remove_multiallelic: - if file_format == "bim": - logger.warning("--remove_multiallelic requested for bim format, which already contains biallelic " - "variant representations only") logger.debug('Dropping multiallelic variants') - return df.filter(~df.get_column('is_multiallelic')) + return df.filter(pl.col('is_multiallelic') == False) else: logger.debug("Exploding dataframe to handle multiallelic variants") df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants @@ -55,3 +68,5 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool, file_format def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: df.with_column( pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) + + diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index c25175a..91e55a6 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -3,7 +3,7 @@ import polars as pl -from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles +from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) @@ -23,18 +23,31 @@ def read_target(path: str, remove_multiallelic: bool) -> pl.DataFrame: dfs: list[pl.DataFrame] = [] for target in targets: assert target.file_format in ['bim', 'pvar'] - dfs.append(target.read().pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic, - file_format=target.file_format)) + dfs.append(target.read()) - return pl.concat(dfs).filter(pl.col("ID") != '.') + logger.debug("Reading all target data complete") + # explicitly rechunk now, because reading is complete and the input data were read unchunked to save memory + # only pipe functions once rechunking has happened to improve speed + # handling multiallelic requires str methods, so don't forget to cast back or matching will break + return (pl.concat(dfs, rechunk=True) + .pipe(filter_target) + .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) + .with_column(pl.col('ALT').cast(pl.Categorical))) def read_scorefile(path: 
str) -> pl.DataFrame: logger.debug("Reading scorefile") - scorefile: pl.DataFrame = (pl.read_csv(path, sep='\t', dtype={'chr_name': str}) + dtypes = {'chr_name': pl.Categorical, + 'chr_position': pl.UInt64, + 'effect_allele': pl.Utf8, # str functions required to complement + 'other_allele': pl.Utf8, + 'effect_type': pl.Categorical, + 'accession': pl.Categorical} + return (pl.scan_csv(path, sep='\t', dtype=dtypes) .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) .with_columns([ - pl.col('accession').cast(pl.Categorical), - pl.col("effect_type").cast(pl.Categorical)])) - - return scorefile + pl.col("effect_allele").cast(pl.Categorical), + pl.col("other_allele").cast(pl.Categorical), + pl.col("effect_allele_FLIP").cast(pl.Categorical), + pl.col("other_allele_FLIP").cast(pl.Categorical) + ])) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 52253a3..9d4ba92 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -7,18 +7,18 @@ logger = logging.getLogger(__name__) -def write_log(df: pl.DataFrame, prefix: str) -> None: +def write_log(df: pl.LazyFrame, prefix: str) -> None: logger.debug(f"Compressing and writing log: {prefix}_log.csv.gz") with gzip.open(f"{prefix}_log.csv.gz", 'wb') as f: - df.write_csv(f) + df.collect().write_csv(f) -def write_out(df: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: +def write_out(df: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: if not os.path.isdir(outdir): os.mkdir(outdir) logger.debug("Splitting by effect type") - effect_types: dict[str, pl.DataFrame] = _split_effect_type(df) + effect_types: dict[str, pl.DataFrame] = _split_effect_type(df.collect()) logger.debug("Deduplicating variants") deduplicated: dict[str, pl.DataFrame] = {k: _deduplicate_variants(k, v) for k, v in effect_types.items()} diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index ee62074..c3fa792 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -35,11 +35,9 @@ def from_path(cls, path): #@profile def read(self): if self.compressed: - df = self._read_compressed_chunks().rechunk().lazy() - return _filter_target(df) + return self._read_compressed_chunks().lazy() else: - df = self._read_uncompressed_chunks().rechunk().lazy() - return _filter_target(df) + return self._read_uncompressed_chunks().lazy() def _read_uncompressed_chunks(self): """ Read a CSV using a BufferedIOReader. This is a bit slower than pl.read_csv() (30s vs 5s). @@ -53,7 +51,7 @@ def _read_uncompressed_chunks(self): 3. Setting categorical dtypes on read 4. 
Don't rechunk until later """ - logger.debug("Reading uncompressed chunks") + logger.debug("Started reading uncompressed chunks") df_lst = [] dtypes = _get_col_dtypes(self.file_format) @@ -75,6 +73,8 @@ def _read_uncompressed_chunks(self): df_lst.append(df) + logger.debug("Finished reading uncompressed chunks") + logger.debug("Concatenating chunked data frames") return pl.concat(df_lst, rechunk=False) def _read_compressed_chunks(self): @@ -82,7 +82,7 @@ def _read_compressed_chunks(self): zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars """ - logger.debug("Reading zstd compressed data") + logger.debug("Started reading zstd compressed data") df_lst = [] dtypes = _get_col_dtypes(self.file_format) columns = _get_default_col_idx(self.file_format) @@ -111,6 +111,8 @@ def _read_compressed_chunks(self): df_lst.append(df) chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + logger.debug("Finished reading zstd compressed chunks") + logger.debug("Concatenating chunked data frames") return pl.concat(df_lst, rechunk=False) @@ -137,7 +139,7 @@ def _get_col_dtypes(file_format): # 4. Base-pair coordinate (1-based; limited to 231-2) # 5. Allele 1 (corresponding to clear bits in .bed; usually minor) # 6. Allele 2 (corresponding to set bits in .bed; usually major) - d = {'column_1': pl.Categorical, 'column_2': str, 'column_3': pl.Float64, 'column_4': pl.UInt64, + d = {'column_1': pl.Categorical, 'column_2': pl.Categorical, 'column_3': pl.Float64, 'column_4': pl.UInt64, 'column_5': pl.Categorical, 'column_6': pl.Categorical} case 'pvar': # 1. CHROM @@ -148,7 +150,7 @@ def _get_col_dtypes(file_format): # 6. QUAL (phred-scaled quality score for whether the locus is variable at all) # 7. FILTER ('PASS', '.', or semicolon-separated list of failing filter codes) # 8. INFO (semicolon-separated list of flags and key-value pairs, with types declared in header) - d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Utf8, 'column_4': pl.Categorical, + d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Categorical, 'column_4': pl.Categorical, 'column_5': pl.Utf8, 'column_6': pl.Float32, 'column_7': pl.Utf8, 'column_8': pl.Utf8} # can't cast ALT to cat yet, because of multiallelic variants! 
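Illustrative sketch (not part of the patch; the data are made up): the "can't cast ALT to cat yet" comment above is the constraint that drives these dtype choices. The .str.split() call used to explode multiallelic sites needs a Utf8 column, so ALT stays Utf8 at read time and read_target() only casts it to pl.Categorical after handle_multiallelic() has run. With the polars 0.14 API used in this patch:

    import polars as pl

    demo = pl.DataFrame({'ID': ['1:100:A:C', '1:200:C:T'], 'ALT': ['C', 'T,G']})
    demo.replace('ALT', demo['ALT'].str.split(by=','))           # ALT becomes a list column (in place)
    exploded = (demo.explode('ALT')                              # one row per alternate allele
                    .with_column(pl.col('ALT').cast(pl.Categorical)))  # safe to cast once biallelic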
case _: @@ -177,6 +179,3 @@ def _default_cols() -> list[str]: """ Standardise column names in a target genome """ return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] - -def _default_cols() -> list[str]: - return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] # only columns we want from a target genome From a559d76416e8291f804ef7802663a504dce0279b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 30 Sep 2022 14:14:44 +0100 Subject: [PATCH 25/46] improve RAM usage --- .../aggregate/aggregate_scores.py | 2 +- pgscatalog_utils/{log_config.py => config.py} | 6 + .../download/download_scorefile.py | 2 +- pgscatalog_utils/match/label.py | 2 - pgscatalog_utils/match/log.py | 9 +- pgscatalog_utils/match/match.py | 40 +++- pgscatalog_utils/match/match_variants.py | 69 ++++--- pgscatalog_utils/match/preprocess.py | 4 - pgscatalog_utils/match/read.py | 27 ++- .../scorefile/combine_scorefiles.py | 2 +- pgscatalog_utils/target.py | 176 +++++++++++------- poetry.lock | 77 ++------ 12 files changed, 233 insertions(+), 183 deletions(-) rename pgscatalog_utils/{log_config.py => config.py} (70%) diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py index 2787680..6109a7f 100644 --- a/pgscatalog_utils/aggregate/aggregate_scores.py +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -3,7 +3,7 @@ import pandas as pd -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level import glob import logging diff --git a/pgscatalog_utils/log_config.py b/pgscatalog_utils/config.py similarity index 70% rename from pgscatalog_utils/log_config.py rename to pgscatalog_utils/config.py index dcd9cbe..8bb2a57 100644 --- a/pgscatalog_utils/log_config.py +++ b/pgscatalog_utils/config.py @@ -1,4 +1,10 @@ import logging +import os + +try: + POLARS_MAX_THREADS: int = int(os.getenv('POLARS_MAX_THREADS')) +except TypeError: + POLARS_MAX_THREADS = 1 # not defined, it's better to be slow than set to n_cores (polars default) def set_logging_level(verbose: bool): diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index fc35529..c12467e 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -10,7 +10,7 @@ from pgscatalog_utils.download.publication import query_publication from pgscatalog_utils.download.score import get_url from pgscatalog_utils.download.trait import query_trait -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level logger = logging.getLogger(__name__) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index ad7423c..072fbb1 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -175,5 +175,3 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra .with_column(pl.max(["exclude", "exclude_ambiguous"])) .drop(["exclude", "exclude_ambiguous"]) .rename({"max": "exclude"})) - - diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 3b4686c..ac44084 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -14,12 +14,13 @@ def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, filter_su # make sure the aggregated best log matches the scoring file accession line count summary_count: pl.LazyFrame = (summary_log.groupby(pl.col('accession')) - .agg(pl.sum('count'))) + .agg(pl.sum('count'))) 
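Illustrative sketch (not part of the patch) of the fallback behaviour in the new config module above: os.getenv() returns None when POLARS_MAX_THREADS is unset, so int(None) raises TypeError and the conservative default of one thread applies. A non-numeric value such as POLARS_MAX_THREADS=all would raise ValueError instead, which that except clause does not catch.

    import os

    def _max_threads() -> int:
        """Mirrors pgscatalog_utils.config: default to 1 thread when the variable is unset."""
        try:
            return int(os.getenv('POLARS_MAX_THREADS'))
        except TypeError:
            return 1  # unset: better to be slow than to grab one thread per core

    os.environ.pop('POLARS_MAX_THREADS', None)   # simulate an unset variable
    assert _max_threads() == 1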
log_count: pl.DataFrame = (scorefile.groupby("accession") - .agg(pl.count()) - .join(summary_count, on='accession')).collect() + .agg(pl.count()) + .join(summary_count, on='accession')).collect() - assert (log_count.get_column('count') == log_count.get_column('count_right')).all(), "Log doesn't match input scoring file" + assert (log_count.get_column('count') == log_count.get_column( + 'count_right')).all(), "Log doesn't match input scoring file" logger.debug("Log matches input scoring file") return _prettify_log(big_log), _prettify_summary(summary_log) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index aedf941..7a9e0f3 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,4 +1,7 @@ +import gc import logging +import os +from tempfile import TemporaryDirectory import polars as pl @@ -7,12 +10,12 @@ logger = logging.getLogger(__name__) -def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bool, remove_ambiguous: bool, - keep_first_match: bool) -> pl.DataFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, remove_ambiguous: bool, + keep_first_match: bool, low_memory: bool) -> pl.DataFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) - matches: list[pl.DataFrame] = [] + matches: list[pl.LazyFrame()] = [] col_order = ['row_nr', 'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'accession', 'effect_allele_FLIP', 'other_allele_FLIP', 'ID', 'REF', 'ALT', 'is_multiallelic', 'matched_effect_allele', 'match_type'] @@ -31,12 +34,34 @@ def get_all_matches(scorefile: pl.DataFrame, target: pl.DataFrame, skip_flip: bo matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) - # manually collect to avoid concat error TODO: try to reproduce and file a bug report - logger.debug("Collecting all matches (parallel)") - return pl.concat(pl.collect_all(matches)).lazy().pipe(label_matches, remove_ambiguous, keep_first_match) + if low_memory: + logger.debug("Batch collecting matches (low memory mode)") + match_lf = _batch_collect(matches) + else: + logger.debug("Collecting all matches (parallel)") + match_lf = pl.concat(pl.collect_all(matches)) + return match_lf.pipe(label_matches, remove_ambiguous, keep_first_match) -def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: str) -> pl.DataFrame: + +def _batch_collect(matches: list[pl.LazyFrame]): + """ A slower alternative to pl.collect_all(), but this approach will use less peak memory + + This batches the .collect() and writes intermediate results to a temporary working directory + + IPC files are binary and remember column schema. Reading them can be extremely fast. 
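A small check of the schema claim above (illustrative, not part of the patch; the filename is made up): Arrow IPC files carry the schema alongside the data, so dtypes such as UInt64 survive the temporary staging, which a CSV round trip would not guarantee.

    import polars as pl

    df = (pl.DataFrame({'ID': ['rs1', 'rs2'], 'POS': [100, 200]})
          .with_column(pl.col('POS').cast(pl.UInt64)))
    df.write_ipc('chunk_0.ipc')
    assert pl.read_ipc('chunk_0.ipc').dtypes == df.dtypes   # schema preserved exactly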
""" + with TemporaryDirectory() as temp_dir: + n_chunks = 0 + for i, match in enumerate(matches): + out_path = os.path.join(temp_dir, str(i) + ".ipc") + match.collect().write_ipc(out_path) + n_chunks += 1 + logger.debug(f"Staged {n_chunks} match chunks to {temp_dir}") + gc.collect() + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")).lazy() + + +def _match_variants(scorefile: pl.LazyFrame, target: pl.LazyFrame, match_type: str) -> pl.LazyFrame: logger.debug(f"Matching strategy: {match_type}") match match_type: case 'refalt': @@ -87,4 +112,3 @@ def _match_variants(scorefile: pl.DataFrame, target: pl.DataFrame, match_type: s pl.col(effect_allele_column).alias("matched_effect_allele"), pl.lit(match_type).alias("match_type")]) .join(target.select(join_cols), on="ID", how="inner")) # get REF / ALT back after first join - diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 64311ee..e85d154 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,11 +1,12 @@ import argparse import logging +import os import textwrap from glob import glob import polars as pl -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level, POLARS_MAX_THREADS from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches @@ -19,29 +20,40 @@ def match_variants(): args = _parse_args() set_logging_level(args.verbose) - - logger.debug(f"polars n_threads: {pl.threadpool_size()}") + logger.debug(f"POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + logger.debug(f"polars threadpool size: {pl.threadpool_size()}") + logger.debug(f"Using {POLARS_MAX_THREADS} threads to read CSVs") with pl.StringCache(): - scorefile: pl.DataFrame = read_scorefile(path=args.scorefile) - + scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) n_target_files = len(glob(args.target)) matches: pl.DataFrame - if n_target_files > 1 and not args.fast: + if n_target_files == 1 and not args.fast: + low_memory: bool = True + match_mode: str = 'single' + elif n_target_files > 1 and not args.fast: + low_memory: bool = True match_mode: str = 'multi' - else: + elif args.fast: + low_memory: bool = False match_mode: str = 'fast' match match_mode: + case "single": + logger.debug(f"Match mode: {match_mode}") # read one target in chunks + matches: pl.LazyFrame = _match_single_target(args.target, scorefile, args.remove_multiallelic, + args.skip_flip, args.remove_ambiguous, + args.keep_first_match, low_memory) case "multi": - logger.debug(f"Match mode: {match_mode}") - matches = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match) + logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks + matches: pl.LazyFrame = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, + args.skip_flip, args.remove_ambiguous, + args.keep_first_match, low_memory) case "fast": - logger.debug(f"Match mode: {match_mode}") - matches = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match) + logger.debug(f"Match mode: {match_mode}") # just read everything into memory for speed + matches: pl.LazyFrame = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + args.remove_ambiguous, 
args.keep_first_match, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -61,8 +73,8 @@ def match_variants(): write_out(valid_matches, args.split, args.outdir, dataset) -def _check_target_chroms(target) -> None: - chroms: list[str] = target['#CHROM'].unique().to_list() +def _check_target_chroms(target: pl.LazyFrame) -> None: + chroms: list[str] = target.select(pl.col("#CHROM").unique()).collect().get_column("#CHROM").to_list() if len(chroms) > 1: logger.critical(f"Multiple chromosomes detected: {chroms}. Check input data.") raise Exception @@ -70,25 +82,34 @@ def _check_target_chroms(target) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: +def _fast_match(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, + skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: # fast match is fast because: # 1) all target files are read into memory # 2) matching occurs without iterating through chromosomes - target: pl.DataFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic) + target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) logger.debug("Split target chromosomes not checked with fast match mode") - return get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match) + return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() + + +def _match_single_target(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, + skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, + low_memory: bool) -> pl.LazyFrame: + target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) + return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() -def _match_multiple_targets(target_path: str, scorefile: pl.DataFrame, remove_multiallelic: bool, - skip_filp: bool, remove_ambiguous: bool, keep_first_match: bool) -> pl.DataFrame: +def _match_multiple_targets(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, + skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, + low_memory: bool) -> pl.LazyFrame: matches = [] for i, loc_target_current in enumerate(glob(target_path)): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.DataFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic) + target: pl.LazyFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic, + low_memory=low_memory) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, skip_filp, remove_ambiguous, keep_first_match)) - return pl.concat(matches) + matches.append(get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory)) + return pl.concat(matches).lazy() def _description_text() -> str: diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index d7b1e86..3f0c38d 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -2,8 +2,6 @@ import polars as pl -from pgscatalog_utils.target import logger - logger = 
logging.getLogger(__name__) @@ -68,5 +66,3 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: df.with_column( pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) - - diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 91e55a6..22271cf 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -3,39 +3,35 @@ import polars as pl +from pgscatalog_utils.config import POLARS_MAX_THREADS from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool) -> pl.DataFrame: +def read_target(path: str, remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: """ Read one or more targets from a path (may contain a wildcard) """ if '*' in path: logger.debug("Wildcard detected in target path: finding all matching files") paths: list[str] = glob.glob(path) else: - logger.debug("") + logger.debug("Found one matching target") paths: list[str] = [path] - targets: list[Target] = [Target.from_path(x) for x in paths] - dfs: list[pl.DataFrame] = [] - for target in targets: - assert target.file_format in ['bim', 'pvar'] - dfs.append(target.read()) + targets: list[Target] = [Target.from_path(x, low_memory) for x in paths] logger.debug("Reading all target data complete") - # explicitly rechunk now, because reading is complete and the input data were read unchunked to save memory - # only pipe functions once rechunking has happened to improve speed # handling multiallelic requires str methods, so don't forget to cast back or matching will break - return (pl.concat(dfs, rechunk=True) + return (pl.concat([x.read() for x in targets]) + .lazy() .pipe(filter_target) .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) .with_column(pl.col('ALT').cast(pl.Categorical))) -def read_scorefile(path: str) -> pl.DataFrame: +def read_scorefile(path: str) -> pl.LazyFrame: logger.debug("Reading scorefile") dtypes = {'chr_name': pl.Categorical, 'chr_position': pl.UInt64, @@ -43,11 +39,10 @@ def read_scorefile(path: str) -> pl.DataFrame: 'other_allele': pl.Utf8, 'effect_type': pl.Categorical, 'accession': pl.Categorical} - return (pl.scan_csv(path, sep='\t', dtype=dtypes) - .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele']) - .with_columns([ + return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=POLARS_MAX_THREADS) + .lazy() + .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])).with_columns([ pl.col("effect_allele").cast(pl.Categorical), pl.col("other_allele").cast(pl.Categorical), pl.col("effect_allele_FLIP").cast(pl.Categorical), - pl.col("other_allele_FLIP").cast(pl.Categorical) - ])) + pl.col("other_allele_FLIP").cast(pl.Categorical)]) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 5b30fda..318d420 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -5,7 +5,7 @@ import pandas as pd -from pgscatalog_utils.log_config import set_logging_level +from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights from 
pgscatalog_utils.scorefile.genome_build import build2GRC diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index c3fa792..9fd662d 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -1,8 +1,15 @@ -import zstandard -from dataclasses import dataclass +import gc import io import logging +import os +from dataclasses import dataclass +from itertools import islice +from tempfile import TemporaryDirectory + import polars as pl +import zstandard + +from pgscatalog_utils.config import POLARS_MAX_THREADS logger = logging.getLogger(__name__) @@ -13,9 +20,10 @@ class Target: file_format: str = None path: str = None compressed: bool = False + low_memory: bool = True # targets can be big, and use a lot of RAM when reading @classmethod - def from_path(cls, path): + def from_path(cls, path, low_memory): """ Create a Target object from a path. Cheaply detect file format and headers. """ try: with open(path, 'r') as f: @@ -30,93 +38,130 @@ def from_path(cls, path): file_format = _get_format(text_stream) compressed = True - return cls(file_format=file_format, path=path, compressed=compressed) + return cls(file_format=file_format, path=path, compressed=compressed, low_memory=low_memory) - #@profile def read(self): - if self.compressed: - return self._read_compressed_chunks().lazy() + if self.low_memory: + if self.compressed: + logger.debug("Reading compressed chunks from target genome (slower, lower RAM usage)") + return self._read_compressed_chunks() + else: + logger.debug("Reading uncompressed chunks from target genome (slower, lower RAM usage)") + return self._read_uncompressed_chunks() else: - return self._read_uncompressed_chunks().lazy() - - def _read_uncompressed_chunks(self): - """ Read a CSV using a BufferedIOReader. This is a bit slower than pl.read_csv() (30s vs 5s). - - Lots of testing showed that lazy scanning and native polars reading used a lot of RAM, then freed a bunch. - Plotting RAM usage against time looked like a spiky hedgehog. - - This function linearly consumes RAM in a more linear way by: - 1. Reading a batch of lines - 2. Dropping unused columns - 3. Setting categorical dtypes on read - 4. Don't rechunk until later - """ - logger.debug("Started reading uncompressed chunks") - - df_lst = [] + if self.compressed: + logger.debug("Reading compressed target genome (fast mode, high RAM usage)") + return self._read_compressed() + else: + logger.debug("Reading uncompressed target genome (fast mode, high RAM usage)") + return self._read_uncompressed() + + def _read_compressed(self) -> pl.DataFrame: + """ Read a zst compressed target as quickly as possible """ + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + with dctx.stream_reader(fh) as reader: + dtypes = _get_col_dtypes(self.file_format) + col_idxs = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + return (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=POLARS_MAX_THREADS)) + + def _read_uncompressed(self) -> pl.DataFrame: + """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. 
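Illustrative sketch (not part of the patch; the filename is made up) of the compressed fast path in _read_compressed() above: zstandard's stream_reader() wraps the compressed file in a file-like object that pl.read_csv() can consume directly, so no intermediate decompressed file is written to disk.

    import polars as pl
    import zstandard

    with open('chr1.pvar.zst', 'rb') as fh:
        reader = zstandard.ZstdDecompressor().stream_reader(fh)
        df = pl.read_csv(reader, sep='\t', has_header=False, comment_char='#')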
""" dtypes = _get_col_dtypes(self.file_format) col_idxs = _get_default_col_idx(self.file_format) new_col_names = _default_cols() + return (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=POLARS_MAX_THREADS)) - with open(self.path, "rb") as f: - while True: - buffer = b''.join(f.readlines(int(1e6))) + def _read_uncompressed_chunks(self) -> pl.DataFrame: + """ Read a CSV using a BufferedReader in batches to reduce memory usage. - if not buffer: - break + Reads 1 million variant chunks and immediately writes to feather format in a temporary directory. - df = (pl.read_csv(buffer, sep='\t', has_header=False, comment_char='#', n_threads=1, - dtype=dtypes, - columns=col_idxs, - new_columns=new_col_names, - rechunk=False)) + Read all temporary feather files and return a big pl.DataFrame. Reading feather is fast, and preserves dtypes. - df_lst.append(df) + Uses ~ 2GB + """ + dtypes = _get_col_dtypes(self.file_format) + col_idxs = _get_default_col_idx(self.file_format) + new_col_names = _default_cols() + with TemporaryDirectory() as temp_dir: + batch_n = 0 + batch_size = int(1e6) + with open(self.path, 'rb') as f: + while True: + line_batch = b''.join(islice(f, batch_size)) + if not line_batch: + break + + out_path = os.path.join(temp_dir, str(batch_n) + '.ipc') + + (pl.read_csv(line_batch, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=POLARS_MAX_THREADS).write_ipc(out_path)) + batch_n += 1 - logger.debug("Finished reading uncompressed chunks") - logger.debug("Concatenating chunked data frames") - return pl.concat(df_lst, rechunk=False) + gc.collect() # just to be safe + logger.debug(f"{batch_n} batches staged in temporary directory {temp_dir}") + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) - def _read_compressed_chunks(self): + def _read_compressed_chunks(self) -> pl.DataFrame: """ Like _read_uncompressed_chunks, but read chunks of bytes and handle incomplete rows zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars """ logger.debug("Started reading zstd compressed data") - df_lst = [] dtypes = _get_col_dtypes(self.file_format) columns = _get_default_col_idx(self.file_format) new_col_names = _default_cols() - with open(self.path, 'rb') as fh: - dctx = zstandard.ZstdDecompressor() - chunk_buffer = b'' + n_chunks = 0 - for chunk in dctx.read_to_iter(fh, read_size=int(1e+8)): # read 100MB of compressed data per chunk - if not chunk: - break - - end = chunk.rfind(b'\n') + 1 # only want to read complete rows - if chunk_buffer: - row_chunk = b''.join([chunk_buffer, chunk[:end]]) - chunk_buffer = b'' - else: - row_chunk = chunk[:end] - - df = pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', n_threads=1, + with TemporaryDirectory() as temp_dir: + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + chunk_buffer = b'' + + for chunk in dctx.read_to_iter(fh, read_size=int(1e8), write_size=int(1e8)): + if not chunk: + logger.debug("Finished reading zstd compressed chunks") + break + + end = chunk.rfind(b'\n') + 1 # only want to read complete rows, which end in \n + if chunk_buffer: + row_chunk = b''.join([chunk_buffer, chunk[:end]]) + chunk_buffer = b'' + else: + row_chunk = chunk[:end] + + out_path = os.path.join(temp_dir, str(n_chunks) + ".ipc") + (pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', dtype=dtypes, columns=columns, 
new_columns=new_col_names, - rechunk=False) - df_lst.append(df) - chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + n_threads=POLARS_MAX_THREADS) + .write_ipc(out_path)) - logger.debug("Finished reading zstd compressed chunks") - logger.debug("Concatenating chunked data frames") - return pl.concat(df_lst, rechunk=False) + chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + n_chunks += 1 + + gc.collect() # just to be safe + logger.debug(f"{n_chunks} chunks") # write_size will change n_chunks + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) def _get_default_col_idx(file_format): + """ Return a list of column integers to keep, assuming plink default column sets """ # import default columns: # ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] match file_format: @@ -130,7 +175,9 @@ def _get_default_col_idx(file_format): def _get_col_dtypes(file_format): - """ Manually set up dtypes. pl.Categorical saves a lot of RAM vs pl.Utf8 """ + """ Manually set up dtypes to save memory. Repeated strings like REF / ALT / CHROM work best as pl.Categorical. + + ID shouldn't be pl.Categorical, or you'll create a massive string cache and waste RAM """ match file_format: case 'bim': # 1. Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name @@ -139,7 +186,7 @@ def _get_col_dtypes(file_format): # 4. Base-pair coordinate (1-based; limited to 231-2) # 5. Allele 1 (corresponding to clear bits in .bed; usually minor) # 6. Allele 2 (corresponding to set bits in .bed; usually major) - d = {'column_1': pl.Categorical, 'column_2': pl.Categorical, 'column_3': pl.Float64, 'column_4': pl.UInt64, + d = {'column_1': pl.Categorical, 'column_2': pl.Utf8, 'column_3': pl.Float64, 'column_4': pl.UInt64, 'column_5': pl.Categorical, 'column_6': pl.Categorical} case 'pvar': # 1. CHROM @@ -150,7 +197,7 @@ def _get_col_dtypes(file_format): # 6. QUAL (phred-scaled quality score for whether the locus is variable at all) # 7. FILTER ('PASS', '.', or semicolon-separated list of failing filter codes) # 8. INFO (semicolon-separated list of flags and key-value pairs, with types declared in header) - d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Categorical, 'column_4': pl.Categorical, + d = {'column_1': pl.Categorical, 'column_2': pl.UInt64, 'column_3': pl.Utf8, 'column_4': pl.Categorical, 'column_5': pl.Utf8, 'column_6': pl.Float32, 'column_7': pl.Utf8, 'column_8': pl.Utf8} # can't cast ALT to cat yet, because of multiallelic variants! case _: @@ -178,4 +225,3 @@ def _get_format(fh) -> str: def _default_cols() -> list[str]: """ Standardise column names in a target genome """ return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] - diff --git a/poetry.lock b/poetry.lock index b8afbdd..7eb7645 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,11 +1,3 @@ -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - [[package]] name = "attrs" version = "22.1.0" @@ -22,7 +14,7 @@ tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (> [[package]] name = "certifi" -version = "2022.6.15" +version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false @@ -41,7 +33,7 @@ pycparser = "*" [[package]] name = "charset-normalizer" -version = "2.1.0" +version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
category = "main" optional = false @@ -122,7 +114,7 @@ woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] [[package]] name = "idna" -version = "3.3" +version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" category = "main" optional = false @@ -138,7 +130,7 @@ python-versions = "*" [[package]] name = "jq" -version = "1.2.2" +version = "1.3.0" description = "jq is a lightweight and flexible JSON processor." category = "main" optional = false @@ -185,7 +177,7 @@ psutil = "*" [[package]] name = "numpy" -version = "1.23.1" +version = "1.23.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" optional = false @@ -204,7 +196,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pandas" -version = "1.4.3" +version = "1.5.0" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -216,7 +208,7 @@ python-dateutil = ">=2.8.1" pytz = ">=2020.1" [package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +test = ["pytest-xdist (>=1.31)", "pytest (>=6.0)", "hypothesis (>=5.5.3)"] [[package]] name = "pillow" @@ -244,20 +236,21 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.14.9" +version = "0.14.14" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] -pandas = ["pyarrow (>=4.0)", "pandas"] connectorx = ["connectorx"] +pyarrow = ["pyarrow (>=4.0)"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] numpy = ["numpy (>=1.16.0)"] +all = ["polars"] +pandas = ["pyarrow (>=4.0)", "pandas"] fsspec = ["fsspec"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] -pytz = ["pytz"] -pyarrow = ["pyarrow (>=4.0)"] [[package]] name = "psutil" @@ -315,14 +308,13 @@ python-versions = ">=3" [[package]] name = "pytest" -version = "7.1.2" +version = "7.1.3" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} iniconfig = "*" @@ -362,7 +354,7 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2022.1" +version = "2022.2.1" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -429,7 +421,7 @@ python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.11" +version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." 
category = "main" optional = false @@ -437,7 +429,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, [package.extras] brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -460,7 +452,6 @@ python-versions = "^3.10" content-hash = "a0d60a1fec35d248340f1640db49d07a7000b23e4bbe22426a9c240ee499c334" [metadata.files] -atomicwrites = [] attrs = [] certifi = [] cffi = [] @@ -470,10 +461,7 @@ contourpy = [] coverage = [] cycler = [] fonttools = [] -idna = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, -] +idna = [] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -484,29 +472,7 @@ matplotlib = [] memory-profiler = [] numpy = [] packaging = [] -pandas = [ - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:958a0588149190c22cdebbc0797e01972950c927a11a900fe6c2296f207b1d6f"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e48fbb64165cda451c06a0f9e4c7a16b534fcabd32546d531b3c240ce2844112"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f803320c9da732cc79210d7e8cc5c8019aad512589c910c66529eb1b1818230"}, - {file = "pandas-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:2893e923472a5e090c2d5e8db83e8f907364ec048572084c7d10ef93546be6d1"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:24ea75f47bbd5574675dae21d51779a4948715416413b30614c1e8b480909f81"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ebc990bd34f4ac3c73a2724c2dcc9ee7bf1ce6cf08e87bb25c6ad33507e318"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d6c0106415ff1a10c326c49bc5dd9ea8b9897a6ca0c8688eb9c30ddec49535ef"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78b00429161ccb0da252229bcda8010b445c4bf924e721265bec5a6e96a92e92"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfbf16b1ea4f4d0ee11084d9c026340514d1d30270eaa82a9f1297b6c8ecbf0"}, - {file = "pandas-1.4.3-cp38-cp38-win32.whl", hash = "sha256:48350592665ea3cbcd07efc8c12ff12d89be09cd47231c7925e3b8afada9d50d"}, - {file = "pandas-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:605d572126eb4ab2eadf5c59d5d69f0608df2bf7bcad5c5880a47a20a0699e3e"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a3924692160e3d847e18702bb048dc38e0e13411d2b503fecb1adf0fcf950ba4"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:07238a58d7cbc8a004855ade7b75bbd22c0db4b0ffccc721556bab8a095515f6"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:755679c49460bd0d2f837ab99f0a26948e68fa0718b7e42afbabd074d945bf84"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41fc406e374590a3d492325b889a2686b31e7a7780bec83db2512988550dadbf"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d9382f72a4f0e93909feece6fef5500e838ce1c355a581b3d8f259839f2ea76"}, - {file = "pandas-1.4.3-cp39-cp39-win32.whl", hash = "sha256:0daf876dba6c622154b2e6741f29e87161f844e64f84801554f879d27ba63c0d"}, - {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, - {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, -] +pandas = [] pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, @@ -530,10 +496,7 @@ python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] -pytz = [ - {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, - {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, -] +pytz = [] requests = [] setuptools-scm = [] six = [ From 5c15a67a32889c59b376d956a66522ab61b6b53e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 3 Oct 2022 10:15:39 +0100 Subject: [PATCH 26/46] fix reading bim files --- pgscatalog_utils/match/match.py | 1 + pgscatalog_utils/match/match_variants.py | 5 +++ pgscatalog_utils/target.py | 46 +++++++++++------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7a9e0f3..d0aeccf 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -10,6 +10,7 @@ logger = logging.getLogger(__name__) +# @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.DataFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index e85d154..187f436 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,6 +1,7 @@ import argparse import logging import os +import sys import textwrap from glob import glob @@ -29,6 +30,10 @@ def match_variants(): n_target_files = len(glob(args.target)) matches: pl.DataFrame + if n_target_files == 0: + logger.critical("No target genomes found, check the path") + sys.exit(1) + if n_target_files == 1 and not args.fast: low_memory: bool = True match_mode: str = 'single' diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 9fd662d..fbbcb8f 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -40,6 +40,7 @@ def from_path(cls, path, low_memory): return cls(file_format=file_format, 
path=path, compressed=compressed, low_memory=low_memory) + # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling def read(self): if self.low_memory: if self.compressed: @@ -62,8 +63,7 @@ def _read_compressed(self) -> pl.DataFrame: dctx = zstandard.ZstdDecompressor() with dctx.stream_reader(fh) as reader: dtypes = _get_col_dtypes(self.file_format) - col_idxs = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + col_idxs, new_col_names = _default_cols(self.file_format) return (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', dtype=dtypes, columns=col_idxs, @@ -73,8 +73,7 @@ def _read_compressed(self) -> pl.DataFrame: def _read_uncompressed(self) -> pl.DataFrame: """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. """ dtypes = _get_col_dtypes(self.file_format) - col_idxs = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + col_idxs, new_col_names = _default_cols(self.file_format) return (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', dtype=dtypes, columns=col_idxs, @@ -91,8 +90,7 @@ def _read_uncompressed_chunks(self) -> pl.DataFrame: Uses ~ 2GB """ dtypes = _get_col_dtypes(self.file_format) - col_idxs = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + col_idxs, new_col_names = _default_cols(self.file_format) with TemporaryDirectory() as temp_dir: batch_n = 0 batch_size = int(1e6) @@ -122,8 +120,7 @@ def _read_compressed_chunks(self) -> pl.DataFrame: """ logger.debug("Started reading zstd compressed data") dtypes = _get_col_dtypes(self.file_format) - columns = _get_default_col_idx(self.file_format) - new_col_names = _default_cols() + columns, new_col_names = _default_cols(self.file_format) n_chunks = 0 @@ -160,20 +157,6 @@ def _read_compressed_chunks(self) -> pl.DataFrame: return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) -def _get_default_col_idx(file_format): - """ Return a list of column integers to keep, assuming plink default column sets """ - # import default columns: - # ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] - match file_format: - case 'bim': - return [0, 1, 3, 4, 5] # see _get_col_dtypes, dropping centimorgans - case 'pvar': - return [0, 1, 2, 3, 4] # dropping QUAL FILTER INFO etc - case _: - logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") - raise Exception - - def _get_col_dtypes(file_format): """ Manually set up dtypes to save memory. Repeated strings like REF / ALT / CHROM work best as pl.Categorical. @@ -187,7 +170,7 @@ def _get_col_dtypes(file_format): # 5. Allele 1 (corresponding to clear bits in .bed; usually minor) # 6. Allele 2 (corresponding to set bits in .bed; usually major) d = {'column_1': pl.Categorical, 'column_2': pl.Utf8, 'column_3': pl.Float64, 'column_4': pl.UInt64, - 'column_5': pl.Categorical, 'column_6': pl.Categorical} + 'column_5': pl.Categorical, 'column_6': pl.Utf8} case 'pvar': # 1. CHROM # 2. 
POS (base-pair coordinate) @@ -222,6 +205,17 @@ def _get_format(fh) -> str: return file_format -def _default_cols() -> list[str]: - """ Standardise column names in a target genome """ - return ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] +def _default_cols(file_format) -> tuple[list[int], list[str]]: + """ Return a list of column integers to keep, assuming plink default column sets """ + match file_format: + case 'bim': + idxs = [0, 1, 3, 4, 5] # see _get_col_dtypes, dropping centimorgans + names = ['#CHROM', 'ID', 'POS', 'REF', 'ALT'] # technically A1/A2, but it's ok + return idxs, names + case 'pvar': + idxs = [0, 1, 2, 3, 4] # dropping QUAL FILTER INFO etc + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT'] + return idxs, names + case _: + logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") + raise Exception From 0dc745dada2bd18a4bc4b47ab66933d7ff1c0210 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 3 Oct 2022 11:50:51 +0100 Subject: [PATCH 27/46] fix tests --- tests/match/test_label.py | 12 ++++++------ tests/match/test_match.py | 32 +++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/match/test_label.py b/tests/match/test_label.py index 8198335..bf354bd 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -29,7 +29,7 @@ def test_label(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # get_all_matches calls label_matches - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() logger.debug(labelled.select(['ID', 'match_type', 'best_match', 'ambiguous', 'match_status', 'exclude'])) @@ -43,7 +43,7 @@ def test_ambiguous_label(small_flipped_scorefile, small_target): """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False) + no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() assert no_ambiguous['best_match'].to_list() == [True] assert no_ambiguous['ambiguous'].to_list() == [True] @@ -51,7 +51,7 @@ def test_ambiguous_label(small_flipped_scorefile, small_target): assert no_ambiguous['match_status'].to_list() == ["excluded"] # otherwise, ambiguous variants are kept - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect() assert labelled['best_match'].to_list() == [True] assert labelled['ambiguous'].to_list() == [True] @@ -105,7 +105,7 @@ def duplicated_matches(small_scorefile, small_target, request): scorefile, target = _cast_cat(dups, small_target) - return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param) + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param).collect() @pytest.fixture @@ -113,7 +113,7 @@ def multiple_match_types(small_target, small_scorefile): # skip flip will return two candidate matches for one target position: refalt + refalt_flip scorefile, 
target = _cast_cat(small_scorefile, small_target) return (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('chr_name') == 2)) + .filter(pl.col('chr_name') == '2')).collect() @pytest.fixture @@ -122,4 +122,4 @@ def duplicate_best_match(small_target, small_scorefile_no_oa): odd_target = {'#CHROM': [1, 1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'], 'ID': ['1:1:T:C', '1:1:A:A'], 'is_multiallelic': [False, False]} scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target)) - return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) + return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False).collect() diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 2c1c8f4..b8fbb07 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -5,7 +5,7 @@ import polars as pl import pytest -from pgscatalog_utils.match.match import get_all_matches, _cast_categorical +from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.match_variants import match_variants @@ -38,9 +38,23 @@ def test_match_pass(mini_scorefile, target_path, tmp_path): match_variants() -def _cast_cat(scorefile, target): +def _cast_cat(scorefile, target) -> tuple[pl.LazyFrame, pl.LazyFrame]: with pl.StringCache(): - return _cast_categorical(scorefile, target) + scorefile = scorefile.with_columns([ + pl.col("chr_name").cast(pl.Utf8).cast(pl.Categorical), + pl.col("effect_allele").cast(pl.Categorical), + pl.col("other_allele").cast(pl.Categorical), + pl.col("effect_type").cast(pl.Categorical), + pl.col("effect_allele_FLIP").cast(pl.Categorical), + pl.col("other_allele_FLIP").cast(pl.Categorical), + pl.col("accession").cast(pl.Categorical) + ]) + target = target.with_columns([ + pl.col("#CHROM").cast(pl.Utf8).cast(pl.Categorical), + pl.col("REF").cast(pl.Categorical), + pl.col("ALT").cast(pl.Categorical) + ]) + return scorefile.lazy(), target.lazy() def test_match_strategies(small_scorefile, small_target): @@ -48,13 +62,13 @@ def test_match_strategies(small_scorefile, small_target): # check unambiguous matches df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == False)) + .filter(pl.col('ambiguous') == False)).collect() assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'}) assert set(df['match_type'].to_list()).issubset(['altref', 'refalt']) # when keeping ambiguous and flipping alleles flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == True)) + .filter(pl.col('ambiguous') == True)).collect() assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'}) @@ -64,14 +78,14 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == False)) + .filter(pl.col('ambiguous') == False)).collect() assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref']) # check ambiguous matches flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, 
keep_first_match=False) - .filter(pl.col('ambiguous') == True)) + .filter(pl.col('ambiguous') == True)).collect() assert set(flip['ID'].to_list()).issubset({'2:2:T:A'}) assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'}) @@ -79,12 +93,12 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False) + df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect() assert set(df['ambiguous']) == {True} assert set(df['match_type']) == {'refalt'} flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False) - .filter(pl.col('ambiguous') == False)) + .filter(pl.col('ambiguous') == False)).collect() assert flip['match_type'].str.contains('flip').all() assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C']) From ba793fbf3194381b3a967cbfd1aba8d82ff2295d Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 3 Oct 2022 11:51:06 +0100 Subject: [PATCH 28/46] fix types --- pgscatalog_utils/match/match.py | 8 ++++---- pgscatalog_utils/match/preprocess.py | 13 ++----------- pgscatalog_utils/match/read.py | 3 +-- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index d0aeccf..049da3a 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -12,7 +12,7 @@ # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, remove_ambiguous: bool, - keep_first_match: bool, low_memory: bool) -> pl.DataFrame: + keep_first_match: bool, low_memory: bool = True) -> pl.LazyFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -42,10 +42,10 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bo logger.debug("Collecting all matches (parallel)") match_lf = pl.concat(pl.collect_all(matches)) - return match_lf.pipe(label_matches, remove_ambiguous, keep_first_match) + return match_lf.lazy().pipe(label_matches, remove_ambiguous, keep_first_match) -def _batch_collect(matches: list[pl.LazyFrame]): +def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: """ A slower alternative to pl.collect_all(), but this approach will use less peak memory This batches the .collect() and writes intermediate results to a temporary working directory @@ -59,7 +59,7 @@ def _batch_collect(matches: list[pl.LazyFrame]): n_chunks += 1 logger.debug(f"Staged {n_chunks} match chunks to {temp_dir}") gc.collect() - return pl.read_ipc(os.path.join(temp_dir, "*.ipc")).lazy() + return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) def _match_variants(scorefile: pl.LazyFrame, target: pl.LazyFrame, match_type: str) -> pl.LazyFrame: diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 3f0c38d..de2711f 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -def filter_target(df): +def filter_target(df: pl.DataFrame) -> pl.DataFrame: """ Remove variants that won't be matched against the scorefile Chromosomes 
1 - 22, X, and Y with an efficient join. Remmove variants with missing identifiers also @@ -45,11 +45,7 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - multiallelic_canary = (df.filter(pl.col('is_multiallelic') == True) - .limit(1) # just detect the first occurring - .collect()) - - if not multiallelic_canary.is_empty(): + if (df.get_column('is_multiallelic')).any(): logger.debug("Multiallelic variants detected") if remove_multiallelic: logger.debug('Dropping multiallelic variants') @@ -61,8 +57,3 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF else: logger.debug("No multiallelic variants detected") return df - - -def _annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: - df.with_column( - pl.when(pl.col("ALT").str.contains(',')).then(pl.lit(True)).otherwise(pl.lit(False)).alias('is_multiallelic')) diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 22271cf..6bdcfc5 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -25,10 +25,9 @@ def read_target(path: str, remove_multiallelic: bool, low_memory: bool) -> pl.La logger.debug("Reading all target data complete") # handling multiallelic requires str methods, so don't forget to cast back or matching will break return (pl.concat([x.read() for x in targets]) - .lazy() .pipe(filter_target) .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) - .with_column(pl.col('ALT').cast(pl.Categorical))) + .with_column(pl.col('ALT').cast(pl.Categorical))).lazy() def read_scorefile(path: str) -> pl.LazyFrame: From a171e5b9cde0db9ab315ebdb2b76ffb359a82ab1 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 13:42:32 +0100 Subject: [PATCH 29/46] update poetry lock file --- poetry.lock | 73 +++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0d15470..2ae26df 100644 --- a/poetry.lock +++ b/poetry.lock @@ -70,7 +70,7 @@ bokeh = ["selenium", "bokeh"] [[package]] name = "coverage" -version = "6.4.4" +version = "6.5.0" description = "Code coverage measurement for Python" category = "dev" optional = false @@ -92,7 +92,7 @@ python-versions = ">=3.6" [[package]] name = "fonttools" -version = "4.37.3" +version = "4.37.4" description = "Tools to manipulate font files" category = "dev" optional = false @@ -187,7 +187,7 @@ python-versions = ">=3.8" name = "packaging" version = "21.3" description = "Core utilities for Python packages" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -210,18 +210,6 @@ pytz = ">=2020.1" [package.extras] test = ["pytest-xdist (>=1.31)", "pytest (>=6.0)", "hypothesis (>=5.5.3)"] -[[package]] -name = "pillow" -version = "9.2.0" -description = "Python Imaging Library (Fork)" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] - [[package]] name = "pandas-schema" version = "0.3.6" @@ -235,6 +223,18 @@ numpy = "*" packaging = "*" pandas = ">=0.19" +[[package]] +name = "pillow" +version = "9.2.0" +description = "Python Imaging Library (Fork)" +category = "dev" +optional = false 
+python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-issues (>=3.0.1)", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pluggy" version = "1.0.0" @@ -249,21 +249,22 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.14.14" +version = "0.14.17" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] +pandas = ["pyarrow (>=4.0.0)", "pandas"] connectorx = ["connectorx"] -pyarrow = ["pyarrow (>=4.0)"] -timezone = ["backports.zoneinfo", "tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] +timezone = ["backports.zoneinfo", "tzdata"] +matplotlib = ["matplotlib"] +fsspec = ["fsspec"] numpy = ["numpy (>=1.16.0)"] all = ["polars"] -pandas = ["pyarrow (>=4.0)", "pandas"] -fsspec = ["fsspec"] +pyarrow = ["pyarrow (>=4.0.0)"] [[package]] name = "psutil" @@ -304,7 +305,7 @@ python-versions = "*" name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "dev" +category = "main" optional = false python-versions = ">=3.6.8" @@ -367,7 +368,7 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.4" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -462,7 +463,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "a0d60a1fec35d248340f1640db49d07a7000b23e4bbe22426a9c240ee499c334" +content-hash = "84b4520b176bb1b892c870fe894814cd05e217a86d7b4fadfa638b91a919bae5" [metadata.files] attrs = [] @@ -485,31 +486,9 @@ matplotlib = [] memory-profiler = [] numpy = [] packaging = [] -pillow = [] -pandas = [ - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5"}, - {file = "pandas-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:958a0588149190c22cdebbc0797e01972950c927a11a900fe6c2296f207b1d6f"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e48fbb64165cda451c06a0f9e4c7a16b534fcabd32546d531b3c240ce2844112"}, - {file = "pandas-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f803320c9da732cc79210d7e8cc5c8019aad512589c910c66529eb1b1818230"}, - {file = "pandas-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:2893e923472a5e090c2d5e8db83e8f907364ec048572084c7d10ef93546be6d1"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:24ea75f47bbd5574675dae21d51779a4948715416413b30614c1e8b480909f81"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ebc990bd34f4ac3c73a2724c2dcc9ee7bf1ce6cf08e87bb25c6ad33507e318"}, - {file = "pandas-1.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d6c0106415ff1a10c326c49bc5dd9ea8b9897a6ca0c8688eb9c30ddec49535ef"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78b00429161ccb0da252229bcda8010b445c4bf924e721265bec5a6e96a92e92"}, - {file = "pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6dfbf16b1ea4f4d0ee11084d9c026340514d1d30270eaa82a9f1297b6c8ecbf0"}, - {file = "pandas-1.4.3-cp38-cp38-win32.whl", hash = "sha256:48350592665ea3cbcd07efc8c12ff12d89be09cd47231c7925e3b8afada9d50d"}, - {file = "pandas-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:605d572126eb4ab2eadf5c59d5d69f0608df2bf7bcad5c5880a47a20a0699e3e"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a3924692160e3d847e18702bb048dc38e0e13411d2b503fecb1adf0fcf950ba4"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07238a58d7cbc8a004855ade7b75bbd22c0db4b0ffccc721556bab8a095515f6"}, - {file = "pandas-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:755679c49460bd0d2f837ab99f0a26948e68fa0718b7e42afbabd074d945bf84"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41fc406e374590a3d492325b889a2686b31e7a7780bec83db2512988550dadbf"}, - {file = "pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d9382f72a4f0e93909feece6fef5500e838ce1c355a581b3d8f259839f2ea76"}, - {file = "pandas-1.4.3-cp39-cp39-win32.whl", hash = "sha256:0daf876dba6c622154b2e6741f29e87161f844e64f84801554f879d27ba63c0d"}, - {file = "pandas-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e"}, - {file = "pandas-1.4.3.tar.gz", hash = "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c"}, -] +pandas = [] pandas-schema = [] +pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, From 593757c0f37e2c652d882758507b275462e0167e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 14:37:56 +0100 Subject: [PATCH 30/46] treat lists of files consistently --- .../aggregate/aggregate_scores.py | 6 +-- pgscatalog_utils/match/match_variants.py | 40 +++++++++---------- pgscatalog_utils/match/read.py | 14 +------ 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py index 6109a7f..653a81d 100644 --- a/pgscatalog_utils/aggregate/aggregate_scores.py +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -13,7 +13,7 @@ def aggregate_scores(): args = _parse_args() set_logging_level(args.verbose) - df = aggregate(glob.glob(args.scores)) + df = aggregate(list(set(args.scores))) logger.debug("Compressing and writing combined scores") df.to_csv('aggregated_scores.txt.gz', sep='\t', compression='gzip') @@ -78,8 +78,8 @@ def _description_text() -> str: def _parse_args(args=None) -> argparse.Namespace: parser = argparse.ArgumentParser(description=_description_text(), formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('-s', '--scores', dest='scores', required=True, - help=' Path to scorefiles. Use a wildcard (*) to select multiple files.') + parser.add_argument('-s', '--scores', dest='scores', required=True, nargs='+', + help=' List of scorefile paths. 
Use a wildcard (*) to select multiple files.') parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', help=' Output directory to store downloaded files') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 187f436..6cc1747 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -27,7 +27,8 @@ def match_variants(): with pl.StringCache(): scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) - n_target_files = len(glob(args.target)) + target_paths = list(set(args.target)) + n_target_files = len(target_paths) matches: pl.DataFrame if n_target_files == 0: @@ -46,18 +47,20 @@ def match_variants(): match match_mode: case "single": - logger.debug(f"Match mode: {match_mode}") # read one target in chunks - matches: pl.LazyFrame = _match_single_target(args.target, scorefile, args.remove_multiallelic, + logger.debug(f"Match mode: {match_mode}") + # _fast_match with low_memory = True reads one target in chunks + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, args.remove_ambiguous, args.keep_first_match, low_memory) case "multi": logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks - matches: pl.LazyFrame = _match_multiple_targets(args.target, scorefile, args.remove_multiallelic, + matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, args.remove_ambiguous, args.keep_first_match, low_memory) case "fast": - logger.debug(f"Match mode: {match_mode}") # just read everything into memory for speed - matches: pl.LazyFrame = _fast_match(args.target, scorefile, args.remove_multiallelic, args.skip_flip, + logger.debug(f"Match mode: {match_mode}") + # _fast_match with low_memory = False just read everything into memory for speed + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, args.remove_ambiguous, args.keep_first_match, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") @@ -87,30 +90,23 @@ def _check_target_chroms(target: pl.LazyFrame) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, +def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: # fast match is fast because: - # 1) all target files are read into memory + # 1) all target files are read into memory without batching # 2) matching occurs without iterating through chromosomes - target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) - logger.debug("Split target chromosomes not checked with fast match mode") + # when low memory is true and n_targets = 1, fast match is the same as "single" match mode + target: pl.LazyFrame = read_target(paths=target_paths, remove_multiallelic=remove_multiallelic, low_memory=low_memory) return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() -def _match_single_target(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, - skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, - low_memory: bool) -> 
pl.LazyFrame: - target: pl.LazyFrame = read_target(path=target_path, remove_multiallelic=remove_multiallelic, low_memory=low_memory) - return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() - - -def _match_multiple_targets(target_path: str, scorefile: pl.LazyFrame, remove_multiallelic: bool, +def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: matches = [] - for i, loc_target_current in enumerate(glob(target_path)): + for i, loc_target_current in enumerate(target_paths): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.LazyFrame = read_target(path=loc_target_current, remove_multiallelic=remove_multiallelic, + target: pl.LazyFrame = read_target(paths=[loc_target_current], remove_multiallelic=remove_multiallelic, low_memory=low_memory) _check_target_chroms(target) matches.append(get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory)) @@ -171,8 +167,8 @@ def _parse_args(args=None): help=' Label for target genomic dataset') parser.add_argument('-s', '--scorefiles', dest='scorefile', required=True, help=' Combined scorefile path (output of read_scorefiles.py)') - parser.add_argument('-t', '--target', dest='target', required=True, - help=' A table of target genomic variants (.bim format)') + parser.add_argument('-t', '--target', dest='target', required=True, nargs='+', + help=' A list of paths of target genomic variants (.bim format)') parser.add_argument('-f', '--fast', dest='fast', action='store_true', help=' Enable faster matching at the cost of increased RAM usage') parser.add_argument('--split', dest='split', default=False, action='store_true', diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 6bdcfc5..ef12543 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,8 +1,7 @@ -import glob + import logging import polars as pl - from pgscatalog_utils.config import POLARS_MAX_THREADS from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target @@ -10,16 +9,7 @@ logger = logging.getLogger(__name__) -def read_target(path: str, remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: - """ Read one or more targets from a path (may contain a wildcard) """ - - if '*' in path: - logger.debug("Wildcard detected in target path: finding all matching files") - paths: list[str] = glob.glob(path) - else: - logger.debug("Found one matching target") - paths: list[str] = [path] - +def read_target(paths: list[str], remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: targets: list[Target] = [Target.from_path(x, low_memory) for x in paths] logger.debug("Reading all target data complete") From 353d8f2349ebdb197987b91c41fb6232eb7ec523 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 4 Oct 2022 15:08:05 +0100 Subject: [PATCH 31/46] Setup a user agent for the download_scorefiles utils (REST API calls to the PGS Catalog) --- pgscatalog_utils/download/download_scorefile.py | 12 +++++++++--- pgscatalog_utils/download/publication.py | 4 ++-- pgscatalog_utils/download/score.py | 17 +++++++++++------ pgscatalog_utils/download/trait.py | 4 ++-- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py 
b/pgscatalog_utils/download/download_scorefile.py index f31c7ab..72a643d 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -33,13 +33,17 @@ def download_scorefile() -> None: pgs_lst: list[list[str]] = [] + pgsc_calc_info = None + if args.pgsc_calc: + pgsc_calc_info = args.pgsc_calc + if args.efo: logger.debug("--trait set, querying traits") - pgs_lst = pgs_lst + [query_trait(x) for x in args.efo] + pgs_lst = pgs_lst + [query_trait(x, pgsc_calc_info) for x in args.efo] if args.pgp: logger.debug("--pgp set, querying publications") - pgs_lst = pgs_lst + [query_publication(x) for x in args.pgp] + pgs_lst = pgs_lst + [query_publication(x, pgsc_calc_info) for x in args.pgp] if args.pgs: logger.debug("--id set, querying scores") @@ -47,7 +51,7 @@ def download_scorefile() -> None: pgs_id: list[str] = list(set(reduce(lambda x, y: x + y, pgs_lst))) - urls: dict[str, str] = get_url(pgs_id, args.build) + urls: dict[str, str] = get_url(pgs_id, args.build, pgsc_calc_info) for pgsid, url in urls.items(): logger.debug(f"Downloading {pgsid} from {url}") @@ -135,6 +139,8 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-o', '--outdir', dest='outdir', required=True, default='scores/', help=' Output directory to store downloaded files') + parser.add_argument('-c', '--pgsc_calc', dest='pgsc_calc', + help=' Provide information about downloading scoring files via pgsc_calc') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) diff --git a/pgscatalog_utils/download/publication.py b/pgscatalog_utils/download/publication.py index 56c7f7b..675b263 100644 --- a/pgscatalog_utils/download/publication.py +++ b/pgscatalog_utils/download/publication.py @@ -6,10 +6,10 @@ logger = logging.getLogger(__name__) -def query_publication(pgp: str) -> list[str]: +def query_publication(pgp: str, user_agent:str = None) -> list[str]: logger.debug("Querying PGS Catalog with publication PGP ID") api: str = f'/publication/{pgp}' - results_json = query_api(api) + results_json = query_api(api, user_agent) if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {pgp}") diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index edad470..4b73916 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -1,6 +1,7 @@ import logging import sys +import pgscatalog_utils import jq import requests import time @@ -8,13 +9,13 @@ logger = logging.getLogger(__name__) -def get_url(pgs: list[str], build: str) -> dict[str, str]: +def get_url(pgs: list[str], build: str, user_agent:str = None) -> dict[str, str]: pgs_result: list[str] = [] url_result: list[str] = [] for chunk in _chunker(pgs): try: - response = _parse_json_query(query_score(chunk), build) + response = _parse_json_query(query_score(chunk,user_agent), build) pgs_result = pgs_result + list(response.keys()) url_result = url_result + list(response.values()) except TypeError: @@ -29,13 +30,17 @@ def get_url(pgs: list[str], build: str) -> dict[str, str]: return dict(zip(pgs_result, url_result)) -def query_api(api: str, retry:int = 0) -> dict: +def query_api(api: str, user_agent:str = None, retry:int = 0) -> dict: max_retries = 5 wait = 60 results_json = None rest_url_root = 'https://www.pgscatalog.org/rest' + # Set pgscatalog_utils user agent if none provided + if not user_agent: + user_agent = 
'pgscatalog_utils/'+pgscatalog_utils.__version__ try: - r: requests.models.Response = requests.get(rest_url_root+api) + headers = {'User-Agent': user_agent} + r: requests.models.Response = requests.get(rest_url_root+api, headers=headers) r.raise_for_status() results_json = r.json() except requests.exceptions.HTTPError as e: @@ -54,10 +59,10 @@ def query_api(api: str, retry:int = 0) -> dict: return results_json -def query_score(pgs_id: list[str]) -> dict: +def query_score(pgs_id: list[str], user_agent:str = None) -> dict: pgs: str = ','.join(pgs_id) api: str = f'/score/search?pgs_ids={pgs}' - results_json = query_api(api) + results_json = query_api(api, user_agent) return results_json diff --git a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index 83af414..609e3e1 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -6,10 +6,10 @@ logger = logging.getLogger(__name__) -def query_trait(trait: str) -> list[str]: +def query_trait(trait: str, user_agent:str = None) -> list[str]: logger.debug(f"Querying PGS Catalog with trait {trait}") api: str = f'/trait/{trait}?include_children=1' - results_json = query_api(api) + results_json = query_api(api, user_agent) if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}") From cc41b4f7db1bfe829fcbb65203a0973c31df7b12 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 4 Oct 2022 15:13:33 +0100 Subject: [PATCH 32/46] Improve library call --- pgscatalog_utils/download/score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index 4b73916..3c2bf29 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -1,10 +1,10 @@ import logging import sys -import pgscatalog_utils import jq import requests import time +from pgscatalog_utils import __version__ as pgscatalog_utils_version logger = logging.getLogger(__name__) @@ -37,7 +37,7 @@ def query_api(api: str, user_agent:str = None, retry:int = 0) -> dict: rest_url_root = 'https://www.pgscatalog.org/rest' # Set pgscatalog_utils user agent if none provided if not user_agent: - user_agent = 'pgscatalog_utils/'+pgscatalog_utils.__version__ + user_agent = 'pgscatalog_utils/'+pgscatalog_utils_version try: headers = {'User-Agent': user_agent} r: requests.models.Response = requests.get(rest_url_root+api, headers=headers) From 4eec95b2e9fecb70afb2b1723a078a462266b45c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 16:50:28 +0100 Subject: [PATCH 33/46] don't hold scorefiles in memory when combining them --- .../scorefile/combine_scorefiles.py | 24 +++++++------------ pgscatalog_utils/scorefile/write.py | 15 ++++++++++-- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 318d420..e8dc610 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,10 +1,9 @@ import argparse import logging +import os import sys import textwrap -import pandas as pd - from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.scorefile.effect_type import set_effect_type from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights @@ -25,7 +24,10 @@ def combine_scorefiles(): paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input 
scorefiles: {paths}") - scorefiles = [] + if os.path.exists(args.outfile): + logger.critical(f"Output file {args.outfile} already exists") + raise Exception + for x in paths: # Read scorefile df and header h, score = load_scorefile(x) @@ -65,19 +67,11 @@ def combine_scorefiles(): logger.error("Try running with --liftover and specifying the --chain_dir") raise Exception - scorefiles.append(score) - - if len(scorefiles) > 0: - scorefiles: pd.DataFrame = pd.concat(scorefiles) - else: - logger.error("No valid scorefiles could be combined") - raise Exception - - if args.liftover: - logger.debug("Annotating scorefiles with liftover parameters") - scorefiles = liftover(scorefiles, args.chain_dir, args.min_lift, args.target_build) + if args.liftover: + logger.debug("Annotating scorefile with liftover parameters") + score = liftover(score, args.chain_dir, args.min_lift, args.target_build) - write_scorefile(scorefiles, args.outfile) + write_scorefile(score, args.outfile) def _description_text() -> str: diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 0dd7b38..175bcab 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -1,4 +1,5 @@ import logging +import os import pandas as pd @@ -9,6 +10,15 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', 'is_duplicated', 'accession', 'row_nr'] + if os.path.exists(path): + logger.debug("Output file exists: setting write mode to append") + write_mode = 'a' + header = False + else: + logger.debug("Output file doesn't exist: setting write mode to write (create new file)") + write_mode = 'w' + header = True + if df.empty: logger.error("Empty scorefile output! Please check the input data") raise Exception @@ -20,12 +30,13 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: if 'other_allele' not in out_df: logger.warning("No other allele information detected, writing out as missing data") out_df['other_allele'] = None + if path.endswith('.gz'): logger.debug("Writing out gzip-compressed combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip') + out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) else: logger.debug("Writing out combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t") + out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: From 03699e28ce463cddb9c493117550a3cde485a0d6 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 4 Oct 2022 17:24:39 +0100 Subject: [PATCH 34/46] check if input and outputs are empty in combine_scorefiles --- .../scorefile/combine_scorefiles.py | 8 +++++ pgscatalog_utils/scorefile/write.py | 30 ++++++++----------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index e8dc610..b7adaa9 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -32,6 +32,10 @@ def combine_scorefiles(): # Read scorefile df and header h, score = load_scorefile(x) + if score.empty: + logger.critical(f"Empty scorefile {x} detected! 
Please check the input data") + raise Exception + # Check if we should use the harmonized positions use_harmonised = False current_build = None @@ -71,6 +75,10 @@ def combine_scorefiles(): logger.debug("Annotating scorefile with liftover parameters") score = liftover(score, args.chain_dir, args.min_lift, args.target_build) + if score.empty: + logger.critical("Empty output score detected, something went wrong while combining") + raise Exception + write_scorefile(score, args.outfile) diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 175bcab..8a3233b 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -19,24 +19,20 @@ def write_scorefile(df: pd.DataFrame, path: str) -> None: write_mode = 'w' header = True - if df.empty: - logger.error("Empty scorefile output! Please check the input data") - raise Exception + out_df: pd.DataFrame = (df.drop('accession', axis=1) + .rename({'filename_prefix': 'accession'}, axis=1) + .pipe(_filter_failed_liftover)) + + if 'other_allele' not in out_df: + logger.warning("No other allele information detected, writing out as missing data") + out_df['other_allele'] = None + + if path.endswith('.gz'): + logger.debug("Writing out gzip-compressed combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) else: - out_df: pd.DataFrame = (df.drop('accession', axis=1) - .rename({'filename_prefix': 'accession'}, axis=1) - .pipe(_filter_failed_liftover)) - - if 'other_allele' not in out_df: - logger.warning("No other allele information detected, writing out as missing data") - out_df['other_allele'] = None - - if path.endswith('.gz'): - logger.debug("Writing out gzip-compressed combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) - else: - logger.debug("Writing out combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) + logger.debug("Writing out combined scorefile") + out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: From 50b1517fc54d9277357dcef9b3ca054f0db88038 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Tue, 4 Oct 2022 17:29:30 +0100 Subject: [PATCH 35/46] Handle case where we might be removing the missing variants (not default) --- pgscatalog_utils/scorefile/combine_scorefiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index b7adaa9..bcafa61 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -75,7 +75,7 @@ def combine_scorefiles(): logger.debug("Annotating scorefile with liftover parameters") score = liftover(score, args.chain_dir, args.min_lift, args.target_build) - if score.empty: + if score.empty and (args.drop_missing is False): logger.critical("Empty output score detected, something went wrong while combining") raise Exception From 0a5dfbe3d2541b62bc46189d1fd6259ef1d96659 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 5 Oct 2022 17:45:15 +0100 Subject: [PATCH 36/46] add parameter for n_threads, set POLARS_MAX_THREADS with it --- pgscatalog_utils/config.py | 6 +----- pgscatalog_utils/match/match_variants.py | 13 ++++++++----- pgscatalog_utils/match/read.py | 4 ++-- pgscatalog_utils/target.py | 6 +++--- 4 
files changed, 14 insertions(+), 15 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 8bb2a57..7a6b8eb 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -1,10 +1,6 @@ import logging -import os -try: - POLARS_MAX_THREADS: int = int(os.getenv('POLARS_MAX_THREADS')) -except TypeError: - POLARS_MAX_THREADS = 1 # not defined, it's better to be slow than set to n_cores (polars default) +POLARS_MAX_THREADS = 1 # dummy value, is reset by args.n_threads (default: 1) def set_logging_level(verbose: bool): diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 6cc1747..698607e 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -3,11 +3,10 @@ import os import sys import textwrap -from glob import glob import polars as pl -from pgscatalog_utils.config import set_logging_level, POLARS_MAX_THREADS +import pgscatalog_utils.config as config from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches @@ -19,11 +18,14 @@ def match_variants(): args = _parse_args() + config.set_logging_level(args.verbose) - set_logging_level(args.verbose) - logger.debug(f"POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + config.POLARS_MAX_THREADS = args.n_threads + os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) + # now the environment variable, parsed argument args.n_threads, and threadpool should agree + logger.debug(f"Setting POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + logger.debug(f"Using {config.POLARS_MAX_THREADS} threads to read CSVs") logger.debug(f"polars threadpool size: {pl.threadpool_size()}") - logger.debug(f"Using {POLARS_MAX_THREADS} threads to read CSVs") with pl.StringCache(): scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) @@ -171,6 +173,7 @@ def _parse_args(args=None): help=' A list of paths of target genomic variants (.bim format)') parser.add_argument('-f', '--fast', dest='fast', action='store_true', help=' Enable faster matching at the cost of increased RAM usage') + parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) parser.add_argument('--split', dest='split', default=False, action='store_true', help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index ef12543..cab5d80 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -2,8 +2,8 @@ import logging import polars as pl -from pgscatalog_utils.config import POLARS_MAX_THREADS from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target +import pgscatalog_utils.config as config from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) @@ -28,7 +28,7 @@ def read_scorefile(path: str) -> pl.LazyFrame: 'other_allele': pl.Utf8, 'effect_type': pl.Categorical, 'accession': pl.Categorical} - return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=POLARS_MAX_THREADS) + return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=config.POLARS_MAX_THREADS) .lazy() .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])).with_columns([ pl.col("effect_allele").cast(pl.Categorical), diff --git a/pgscatalog_utils/target.py 
b/pgscatalog_utils/target.py index fbbcb8f..ca6755c 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -9,7 +9,7 @@ import polars as pl import zstandard -from pgscatalog_utils.config import POLARS_MAX_THREADS +import pgscatalog_utils.config as config logger = logging.getLogger(__name__) @@ -68,7 +68,7 @@ def _read_compressed(self) -> pl.DataFrame: dtype=dtypes, columns=col_idxs, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS)) + n_threads=config.POLARS_MAX_THREADS)) def _read_uncompressed(self) -> pl.DataFrame: """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. """ @@ -78,7 +78,7 @@ def _read_uncompressed(self) -> pl.DataFrame: dtype=dtypes, columns=col_idxs, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS)) + n_threads=config.POLARS_MAX_THREADS)) def _read_uncompressed_chunks(self) -> pl.DataFrame: """ Read a CSV using a BufferedReader in batches to reduce memory usage. From ca1734e922e84406c03ebe518ca2101ff759c714 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 5 Oct 2022 17:47:30 +0100 Subject: [PATCH 37/46] move dropping multiallelics from preprocessing to labelling --- pgscatalog_utils/match/label.py | 22 ++++++++++++-- pgscatalog_utils/match/match.py | 6 ++-- pgscatalog_utils/match/match_variants.py | 37 +++++++++++++----------- pgscatalog_utils/match/preprocess.py | 14 ++++----- pgscatalog_utils/match/read.py | 6 ++-- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 072fbb1..9be6316 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -18,8 +18,9 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match) - .pipe(_label_duplicate_id, keep_first_match) - .pipe(_label_biallelic_ambiguous, remove_ambiguous) + .pipe(_label_duplicate_id, params['keep_first_match']) + .pipe(_label_biallelic_ambiguous, params['remove_ambiguous']) + .pipe(_label_multiallelic, params['remove_multiallelic']) .with_column(pl.lit(True).alias('match_candidate'))) return _encode_match_priority(labelled) @@ -175,3 +176,20 @@ def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFra .with_column(pl.max(["exclude", "exclude_ambiguous"])) .drop(["exclude", "exclude_ambiguous"]) .rename({"max": "exclude"})) + + +def _label_multiallelic(df: pl.LazyFrame, remove_multiallelic: bool) -> pl.LazyFrame: + """ Label multiallelic variants with exclude flag + + (Multiallelic variants are already labelled with the "is_multiallelic" column in match.preprocess) + """ + if remove_multiallelic: + logger.debug("Labelling multiallelic matches with exclude flag") + return df.with_column(pl.when(pl.col('is_multiallelic') == True) + .then(True) + .otherwise(pl.col('exclude')) # don't overwrite existing exclude flags + .alias('exclude')) + else: + logger.debug("Not excluding multiallelic variants") + return df + diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 049da3a..8f79d4c 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -11,8 +11,8 @@ # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling -def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bool, 
remove_ambiguous: bool, - keep_first_match: bool, low_memory: bool = True) -> pl.LazyFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: dict[str: bool], + low_memory: bool = True) -> pl.LazyFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -42,7 +42,7 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, skip_flip: bo logger.debug("Collecting all matches (parallel)") match_lf = pl.concat(pl.collect_all(matches)) - return match_lf.lazy().pipe(label_matches, remove_ambiguous, keep_first_match) + return match_lf.lazy().pipe(label_matches, label_params) def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 698607e..23bce5f 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -51,19 +51,14 @@ def match_variants(): case "single": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = True reads one target in chunks - matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, - args.skip_flip, args.remove_ambiguous, - args.keep_first_match, low_memory) + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args, low_memory) case "multi": logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks - matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args.remove_multiallelic, - args.skip_flip, args.remove_ambiguous, - args.keep_first_match, low_memory) + matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args, low_memory) case "fast": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = False just read everything into memory for speed - matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args.remove_multiallelic, args.skip_flip, - args.remove_ambiguous, args.keep_first_match, low_memory) + matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -92,26 +87,26 @@ def _check_target_chroms(target: pl.LazyFrame) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, - skip_flip: bool, remove_ambiguous: bool, keep_first_match: bool, low_memory: bool) -> pl.LazyFrame: +def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, + args: argparse.Namespace, low_memory: bool) -> pl.LazyFrame: # fast match is fast because: # 1) all target files are read into memory without batching # 2) matching occurs without iterating through chromosomes # when low memory is true and n_targets = 1, fast match is the same as "single" match mode - target: pl.LazyFrame = read_target(paths=target_paths, remove_multiallelic=remove_multiallelic, low_memory=low_memory) - return get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory).lazy() + params: dict[str, bool] = _make_params_dict(args) + target: pl.LazyFrame = read_target(paths=target_paths, low_memory=low_memory) + return get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory).lazy() -def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, remove_multiallelic: bool, - skip_flip: bool, remove_ambiguous: 
bool, keep_first_match: bool, +def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace, low_memory: bool) -> pl.LazyFrame: matches = [] + params: dict[str, bool] = _make_params_dict(args) for i, loc_target_current in enumerate(target_paths): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') - target: pl.LazyFrame = read_target(paths=[loc_target_current], remove_multiallelic=remove_multiallelic, - low_memory=low_memory) + target: pl.LazyFrame = read_target(paths=[loc_target_current], low_memory=low_memory) _check_target_chroms(target) - matches.append(get_all_matches(scorefile, target, skip_flip, remove_ambiguous, keep_first_match, low_memory)) + matches.append(get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory)) return pl.concat(matches).lazy() @@ -201,5 +196,13 @@ def _parse_args(args=None): return parser.parse_args(args) +def _make_params_dict(args) -> dict[str, bool]: + """ Make a dictionary with parameters that control labelling match candidates """ + return {'keep_first_match': args.keep_first_match, + 'remove_ambiguous': args.remove_ambiguous, + 'skip_flip': args.skip_flip, + 'remove_multiallelic': args.remove_multiallelic} + + if __name__ == "__main__": match_variants() diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index de2711f..9997176 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -37,7 +37,8 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataFrame: +def annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: + """ Identify variants that are multiallelic with a column flag """ # plink2 pvar multi-alleles are comma-separated df: pl.DataFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) @@ -46,14 +47,9 @@ def handle_multiallelic(df: pl.DataFrame, remove_multiallelic: bool) -> pl.DataF .alias('is_multiallelic'))) if (df.get_column('is_multiallelic')).any(): - logger.debug("Multiallelic variants detected") - if remove_multiallelic: - logger.debug('Dropping multiallelic variants') - return df.filter(pl.col('is_multiallelic') == False) - else: - logger.debug("Exploding dataframe to handle multiallelic variants") - df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants - return df.explode('ALT') # expand the DF to have all the variants in different rows + logger.debug("Exploding dataframe to handle multiallelic variants") + df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants + return df.explode('ALT') # expand the DF to have all the variants in different rows else: logger.debug("No multiallelic variants detected") return df diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index cab5d80..e7417f1 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -2,21 +2,21 @@ import logging import polars as pl -from pgscatalog_utils.match.preprocess import handle_multiallelic, complement_valid_alleles, filter_target import pgscatalog_utils.config as config +from pgscatalog_utils.match.preprocess import annotate_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target logger = logging.getLogger(__name__) -def read_target(paths: list[str], remove_multiallelic: bool, low_memory: bool) -> pl.LazyFrame: +def 
read_target(paths: list[str], low_memory: bool) -> pl.LazyFrame: targets: list[Target] = [Target.from_path(x, low_memory) for x in paths] logger.debug("Reading all target data complete") # handling multiallelic requires str methods, so don't forget to cast back or matching will break return (pl.concat([x.read() for x in targets]) .pipe(filter_target) - .pipe(handle_multiallelic, remove_multiallelic=remove_multiallelic) + .pipe(annotate_multiallelic) .with_column(pl.col('ALT').cast(pl.Categorical))).lazy() From 8f1f771624318bfcf9f8ce408801a7ddcc500930 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 5 Oct 2022 17:48:07 +0100 Subject: [PATCH 38/46] move skipping flips from matching to labelling --- pgscatalog_utils/match/label.py | 18 +++++++++++++++++- pgscatalog_utils/match/log.py | 4 ++-- pgscatalog_utils/match/match.py | 10 ++++------ 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 9be6316..bc9b56c 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.DataFrame: +def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: """ Label match candidates with additional metadata. Column definitions: - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) @@ -21,6 +21,7 @@ def label_matches(df: pl.DataFrame, remove_ambiguous, keep_first_match) -> pl.Da .pipe(_label_duplicate_id, params['keep_first_match']) .pipe(_label_biallelic_ambiguous, params['remove_ambiguous']) .pipe(_label_multiallelic, params['remove_multiallelic']) + .pipe(_label_flips, params['skip_flip']) .with_column(pl.lit(True).alias('match_candidate'))) return _encode_match_priority(labelled) @@ -193,3 +194,18 @@ def _label_multiallelic(df: pl.LazyFrame, remove_multiallelic: bool) -> pl.LazyF logger.debug("Not excluding multiallelic variants") return df + +def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame: + df = df.with_column(pl.when(pl.col('match_type').str.contains('_FLIP')) + .then(True) + .otherwise(False) + .alias('is_flipped')) + if skip_flip: + logger.debug("Labelling flipped matches with exclude flag") + return df.with_column(pl.when(pl.col('is_flipped') == True) + .then(True) + .otherwise(pl.col('exclude')) # don't overwrite existing exclude flags + .alias('exclude')) + else: + logger.debug("Not excluding flipped matches") + return df \ No newline at end of file diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index ac44084..d2acf42 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -30,8 +30,8 @@ def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) - """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches - .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'duplicate_best_match', - 'duplicate_ID']) + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'is_flipped', + 'duplicate_best_match', 'duplicate_ID']) .agg(pl.count()) .join(filter_summary, how='left', on='accession')) diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 8f79d4c..7022eea 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -24,16 
+24,14 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: logger.debug("Getting matches for scores with effect allele and other allele") matches.append(_match_variants(scorefile=scorefile_oa, target=target, match_type="refalt").select(col_order)) matches.append(_match_variants(scorefile_oa, target, match_type="altref").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_oa, target, match_type="refalt_flip").select(col_order)) - matches.append(_match_variants(scorefile_oa, target, match_type="altref_flip").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="refalt_flip").select(col_order)) + matches.append(_match_variants(scorefile_oa, target, match_type="altref_flip").select(col_order)) logger.debug("Getting matches for scores with effect allele only") matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt").select(col_order)) - if skip_flip is False: - matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) - matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) + matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) if low_memory: logger.debug("Batch collecting matches (low memory mode)") From f5b64cfd9a1f3ec428840d4fd9034d463b70dcd7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 6 Oct 2022 16:09:03 +0100 Subject: [PATCH 39/46] is_flipped -> match_flipped, fix uppercase match type --- pgscatalog_utils/match/label.py | 6 +++--- pgscatalog_utils/match/log.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index bc9b56c..357e3f6 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -196,13 +196,13 @@ def _label_multiallelic(df: pl.LazyFrame, remove_multiallelic: bool) -> pl.LazyF def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame: - df = df.with_column(pl.when(pl.col('match_type').str.contains('_FLIP')) + df = df.with_column(pl.when(pl.col('match_type').str.contains('_flip')) .then(True) .otherwise(False) - .alias('is_flipped')) + .alias('match_flipped')) if skip_flip: logger.debug("Labelling flipped matches with exclude flag") - return df.with_column(pl.when(pl.col('is_flipped') == True) + return df.with_column(pl.when(pl.col('match_flipped') == True) .then(True) .otherwise(pl.col('exclude')) # don't overwrite existing exclude flags .alias('exclude')) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index d2acf42..6143308 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -30,7 +30,7 @@ def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) - """ Make an aggregated table """ logger.debug("Aggregating best match log into a summary table") return (best_matches - .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'is_flipped', + .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped', 'duplicate_best_match', 'duplicate_ID']) .agg(pl.count()) .join(filter_summary, how='left', on='accession')) From 
5b18299540a87645adc528fb99df102634a2a1ec Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 7 Oct 2022 11:23:07 +0100 Subject: [PATCH 40/46] move label_matches from get_all_matches to match_variants --- pgscatalog_utils/match/label.py | 17 +++++++++-------- pgscatalog_utils/match/match.py | 7 ++----- pgscatalog_utils/match/match_variants.py | 9 ++++++--- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 357e3f6..1c55ba3 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: +def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: """ Label match candidates with additional metadata. Column definitions: - match_candidate: All input variants that were returned from match.get_all_matches() (always True in this function) @@ -15,6 +15,7 @@ def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: - duplicate: True if more than one best match exists for the same accession and ID - ambiguous: True if ambiguous """ + assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip'} labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_* .pipe(_label_best_match) .pipe(_label_duplicate_best_match) @@ -27,7 +28,7 @@ def label_matches(df: pl.DataFrame, params: dict[str, bool]) -> pl.DataFrame: return _encode_match_priority(labelled) -def _encode_match_priority(df: pl.DataFrame) -> pl.DataFrame: +def _encode_match_priority(df: pl.LazyFrame) -> pl.LazyFrame: """ Encode a new column called match status containing matched, unmatched, excluded, and not_best """ return (df.with_columns([ # set false best match to not_best @@ -41,7 +42,7 @@ def _encode_match_priority(df: pl.DataFrame) -> pl.DataFrame: .cast(pl.Categorical)).drop(["max", "excluded_match_priority", "match_priority"])) -def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: +def _label_best_match(df: pl.LazyFrame) -> pl.LazyFrame: """ Best matches have the lowest match priority type. Find the best matches and label them. 
""" logger.debug("Labelling best match type (refalt > altref > ...)") match_priority = {'refalt': 0, 'altref': 1, 'refalt_flip': 2, 'altref_flip': 3, 'no_oa_ref': 4, 'no_oa_alt': 5, @@ -50,7 +51,7 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: # use a groupby aggregation to guarantee the number of rows stays the same # rows were being lost using an anti join + reduce approach - prioritised: pl.DataFrame = (df.with_column(pl.col('match_type') + prioritised: pl.LazyFrame = (df.with_column(pl.col('match_type') .apply(lambda x: match_priority[x]) .alias('match_priority')) .with_column(pl.col("match_priority") @@ -66,7 +67,7 @@ def _label_best_match(df: pl.DataFrame) -> pl.DataFrame: return prioritised.drop(['match_priority', 'best_match_type']) -def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: +def _label_duplicate_best_match(df: pl.LazyFrame) -> pl.LazyFrame: """ A scoring file row_nr in an accession group can be duplicated if a target position has different REF, e.g.: ┌────────┬────────────────────────┬────────────┬────────────────┬─────┬────────────┐ @@ -82,7 +83,7 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: Label the first row with best_match = true, and duplicate rows with best_match = false """ logger.debug("Labelling duplicated best match: keeping first instance as best_match = True") - labelled: pl.DataFrame = (df.with_column(pl.col('best_match') + labelled: pl.LazyFrame = (df.with_column(pl.col('best_match') .count() .over(['accession', 'row_nr', 'best_match']) .alias('count')) @@ -106,7 +107,7 @@ def _label_duplicate_best_match(df: pl.DataFrame) -> pl.DataFrame: return labelled -def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFrame: +def _label_duplicate_id(df: pl.LazyFrame, keep_first_match: bool) -> pl.LazyFrame: """ Label best match duplicates made when the scoring file is remapped to a different genome build ┌─────────┬────────────────────────┬─────────────┬────────────────┬─────┬────────────┐ @@ -153,7 +154,7 @@ def _label_duplicate_id(df: pl.DataFrame, keep_first_match: bool) -> pl.DataFram .rename({"max": "exclude"})) -def _label_biallelic_ambiguous(df: pl.DataFrame, remove_ambiguous) -> pl.DataFrame: +def _label_biallelic_ambiguous(df: pl.LazyFrame, remove_ambiguous) -> pl.LazyFrame: logger.debug("Labelling ambiguous variants") ambig = ((df.with_columns([ pl.col(["effect_allele", "other_allele", "REF", "ALT", "effect_allele_FLIP", "other_allele_FLIP"]).cast(str), diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 7022eea..4363dd5 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -5,14 +5,11 @@ import polars as pl -from pgscatalog_utils.match.label import label_matches - logger = logging.getLogger(__name__) # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling -def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: dict[str: bool], - low_memory: bool = True) -> pl.LazyFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, low_memory: bool = True) -> pl.LazyFrame: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -40,7 +37,7 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, label_params: logger.debug("Collecting all matches (parallel)") match_lf = pl.concat(pl.collect_all(matches)) - return match_lf.lazy().pipe(label_matches, 
label_params) + return match_lf.lazy() def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 23bce5f..1fc322d 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -8,6 +8,7 @@ import pgscatalog_utils.config as config from pgscatalog_utils.match.filter import filter_scores +from pgscatalog_utils.match.label import label_matches from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.read import read_target, read_scorefile @@ -95,7 +96,8 @@ def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, # when low memory is true and n_targets = 1, fast match is the same as "single" match mode params: dict[str, bool] = _make_params_dict(args) target: pl.LazyFrame = read_target(paths=target_paths, low_memory=low_memory) - return get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory).lazy() + return (get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory) + .pipe(label_matches, params=params)) def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace, @@ -106,8 +108,9 @@ def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, ar logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.LazyFrame = read_target(paths=[loc_target_current], low_memory=low_memory) _check_target_chroms(target) - matches.append(get_all_matches(scorefile=scorefile, target=target, label_params=params, low_memory=low_memory)) - return pl.concat(matches).lazy() + matches.append(get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory)) + return (pl.concat(matches) + .pipe(label_params=params)) def _description_text() -> str: From df44d9becaa5bc19add8fb81f32838c754239e96 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 7 Oct 2022 11:23:28 +0100 Subject: [PATCH 41/46] fix setting n_threads when reading --- pgscatalog_utils/target.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index ca6755c..3573ee6 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -106,7 +106,7 @@ def _read_uncompressed_chunks(self) -> pl.DataFrame: dtype=dtypes, columns=col_idxs, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS).write_ipc(out_path)) + n_threads=config.POLARS_MAX_THREADS).write_ipc(out_path)) batch_n += 1 gc.collect() # just to be safe @@ -146,7 +146,7 @@ def _read_compressed_chunks(self) -> pl.DataFrame: dtype=dtypes, columns=columns, new_columns=new_col_names, - n_threads=POLARS_MAX_THREADS) + n_threads=config.POLARS_MAX_THREADS) .write_ipc(out_path)) chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) From be96d14d325567c6fcff8f7601c5617fd147ba3f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 7 Oct 2022 11:23:36 +0100 Subject: [PATCH 42/46] fix tests --- tests/match/test_label.py | 86 ++++++++++++++++++++++++++------------- tests/match/test_match.py | 51 ++++++++++++++++------- 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/tests/match/test_label.py b/tests/match/test_label.py index bf354bd..ebe0c43 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -4,6 +4,7 @@ import pytest import polars as pl +from pgscatalog_utils.match.label import 
label_matches from pgscatalog_utils.match.match import get_all_matches from tests.match.test_match import _cast_cat @@ -29,37 +30,54 @@ def test_label(small_scorefile, small_target): scorefile, target = _cast_cat(small_scorefile, small_target) # get_all_matches calls label_matches - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() + params = {'skip_flip': True, 'remove_ambiguous': True, 'remove_multiallelic': False, 'keep_first_match': False} + labelled: pl.DataFrame = (get_all_matches(scorefile=scorefile, target=target) + .pipe(label_matches, params=params) + .collect()) logger.debug(labelled.select(['ID', 'match_type', 'best_match', 'ambiguous', 'match_status', 'exclude'])) - assert labelled['best_match'].to_list() == [True, True, True] - assert labelled['ambiguous'].to_list() == [False, True, False] - assert labelled['exclude'].to_list() == [False, True, False] - assert labelled['match_status'].to_list() == ["matched", "excluded", "matched"] + assert labelled['best_match'].to_list() == [True, True, True, False] + assert labelled['ambiguous'].to_list() == [False, True, False, True] + assert labelled['exclude'].to_list() == [False, True, False, True] + assert labelled['match_status'].to_list() == ["matched", "excluded", "matched", "not_best"] def test_ambiguous_label(small_flipped_scorefile, small_target): """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ scorefile, target = _cast_cat(small_flipped_scorefile, small_target) - - no_ambiguous = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=True, keep_first_match=False).collect() - - assert no_ambiguous['best_match'].to_list() == [True] - assert no_ambiguous['ambiguous'].to_list() == [True] - assert no_ambiguous['exclude'].to_list() == [True] - assert no_ambiguous['match_status'].to_list() == ["excluded"] + no_flip = {'skip_flip': True, 'remove_ambiguous': True, 'remove_multiallelic': False, 'keep_first_match': False} + no_ambiguous: pl.DataFrame = (get_all_matches(scorefile=scorefile, target=target) + .pipe(label_matches, params=no_flip) + .collect()) + + # 2:2:T:A -> refalt -> ambiguous -> excluded (best match but ambiguous) + # 1:1:A:C -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip) + # 2:2:T:A -> refalt_flip -> ambiguous -> not_best (refalt priority so not best and excluded) + # 3:3:T:G -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip) + assert no_ambiguous['best_match'].to_list() == [True, True, False, True] + assert no_ambiguous['ambiguous'].to_list() == [True, False, True, False] + assert no_ambiguous['exclude'].to_list() == [True, True, True, True] + assert no_ambiguous['match_status'].to_list() == ["excluded", "excluded", "not_best", "excluded"] # otherwise, ambiguous variants are kept - labelled = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect() - - assert labelled['best_match'].to_list() == [True] - assert labelled['ambiguous'].to_list() == [True] - assert labelled['exclude'].to_list() == [False] - assert labelled['match_status'].to_list() == ["matched"] - - -def test_duplicate_best_match(duplicated_matches, request): + flip_params = {'skip_flip': True, 'remove_ambiguous': False, 'remove_multiallelic': False, + 'keep_first_match': False} + labelled = (get_all_matches(scorefile=scorefile, target=target) + .pipe(label_matches, params=flip_params) + .collect()) + + # 2:2:T:A -> 
+    # 1:1:A:C -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip)
+    # 2:2:T:A -> refalt_flip -> ambiguous -> not_best (refalt priority so not best and excluded)
+    # 3:3:T:G -> refalt_flip -> not ambiguous -> excluded (best match but skip_flip)
+    assert labelled['best_match'].to_list() == [True, True, False, True]
+    assert labelled['ambiguous'].to_list() == [True, False, True, False]
+    assert labelled['exclude'].to_list() == [False, True, True, True]
+    assert labelled['match_status'].to_list() == ["matched", "excluded", "not_best", "excluded"]
+
+
+def test_duplicate_ID(duplicated_matches, request):
     # these matches come from different lines in the original scoring file
     assert duplicated_matches["row_nr"].to_list() == [1, 4]
     # but they have the same ID!
@@ -94,7 +112,7 @@ def test_duplicate_best_match(duplicate_best_match):


 @pytest.fixture(params=[True, False], ids=["keep_first_match", "delete_both"])
-def duplicated_matches(small_scorefile, small_target, request):
+def duplicated_matches(small_scorefile, small_target, request) -> pl.DataFrame:
     # pgs catalog scorefiles can contain the same variant remapped to multiple rows
     # this happens after liftover to a different genome build
     # row_nrs will be different, but other information may be the same
@@ -105,21 +123,33 @@ def duplicated_matches(small_scorefile, small_target, request):
     scorefile, target = _cast_cat(dups, small_target)

-    return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=request.param).collect()
+    params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False,
+              'keep_first_match': request.param}
+    return (get_all_matches(scorefile=scorefile, target=target)
+            .pipe(label_matches, params=params)
+            .collect())


 @pytest.fixture
-def multiple_match_types(small_target, small_scorefile):
+def multiple_match_types(small_target, small_scorefile) -> pl.DataFrame:
     # skip flip will return two candidate matches for one target position: refalt + refalt_flip
     scorefile, target = _cast_cat(small_scorefile, small_target)
-    return (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('chr_name') == '2')).collect()
+
+    params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False}
+    return (get_all_matches(scorefile=scorefile, target=target)
+            .pipe(label_matches, params=params)
+            .filter(pl.col('chr_name') == '2')
+            .collect())


 @pytest.fixture
-def duplicate_best_match(small_target, small_scorefile_no_oa):
+def duplicate_best_match(small_target, small_scorefile_no_oa) -> pl.DataFrame:
     # this type of target genome can sometimes occur when the REF is different at the same position
     odd_target = {'#CHROM': [1, 1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'],
                   'ID': ['1:1:T:C', '1:1:A:A'], 'is_multiallelic': [False, False]}
     scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target))
-    return get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False).collect()
+
+    params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False}
+    return (get_all_matches(scorefile=scorefile, target=target)
+            .pipe(label_matches, params=params)
+            .collect())
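The expected labels in the test_label.py changes above all hinge on strand ambiguity: a variant like 2:2:T:A is palindromic (its two alleles are complementary bases), so a strand flip cannot be told apart from an allele swap, whereas 1:1:A:C and 3:3:T:G are unambiguous. A tiny, hypothetical check of that property, for illustration only and not the labelling logic inside label_matches:

    COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def is_strand_ambiguous(effect_allele: str, other_allele: str) -> bool:
        # A biallelic SNP is ambiguous when its alleles complement each other (A/T or C/G):
        # flipping strand reproduces the same allele pair, so orientation is unresolvable.
        return COMPLEMENT[effect_allele] == other_allele

    print(is_strand_ambiguous('T', 'A'))  # True  -> why 2:2:T:A is labelled ambiguous
    print(is_strand_ambiguous('A', 'C'))  # False -> 1:1:A:C
    print(is_strand_ambiguous('T', 'G'))  # False -> 3:3:T:G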
diff --git a/tests/match/test_match.py b/tests/match/test_match.py
index b8fbb07..ca509d6 100644
--- a/tests/match/test_match.py
+++ b/tests/match/test_match.py
@@ -5,6 +5,7 @@
 import polars as pl
 import pytest

+from pgscatalog_utils.match.label import label_matches
 from pgscatalog_utils.match.match import get_all_matches
 from pgscatalog_utils.match.match_variants import match_variants
@@ -60,15 +61,21 @@ def _cast_cat(scorefile, target) -> tuple[pl.LazyFrame, pl.LazyFrame]:

 def test_match_strategies(small_scorefile, small_target):
     scorefile, target = _cast_cat(small_scorefile, small_target)

+    params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
     # check unambiguous matches
-    df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False)
-          .filter(pl.col('ambiguous') == False)).collect()
+    df: pl.DataFrame = (get_all_matches(scorefile, target)
+                        .pipe(label_matches, params=params)
+                        .filter(pl.col('ambiguous') == False)
+                        .collect())
     assert set(df['ID'].to_list()).issubset({'3:3:T:G', '1:1:A:C'})
     assert set(df['match_type'].to_list()).issubset(['altref', 'refalt'])

     # when keeping ambiguous and flipping alleles
-    flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('ambiguous') == True)).collect()
+    flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
+    flip: pl.DataFrame = (get_all_matches(scorefile, target)
+                          .pipe(label_matches, params=flip_params)
+                          .filter(pl.col('ambiguous') == True)
+                          .collect())
     assert set(flip['ID'].to_list()).issubset({'2:2:T:A'})
     assert set(flip['match_type'].to_list()).issubset({'altref', 'refalt_flip'})
@@ -77,28 +84,42 @@ def test_no_oa_match(small_scorefile_no_oa, small_target):
     scorefile, target = _cast_cat(small_scorefile_no_oa, small_target)

-    df = (get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False)
-          .filter(pl.col('ambiguous') == False)).collect()
+    no_ambig = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
+    df: pl.DataFrame = (get_all_matches(scorefile, target)
+                        .pipe(label_matches, params=no_ambig)
+                        .filter(pl.col('ambiguous') == False)
+                        .collect())
     assert set(df['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C'])
     assert set(df['match_type'].to_list()).issubset(['no_oa_alt', 'no_oa_ref'])

     # check ambiguous matches
-    flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('ambiguous') == True)).collect()
+    ambig = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False}
+    flip: pl.DataFrame = (get_all_matches(scorefile, target)
+                          .pipe(label_matches, ambig)
+                          .filter(pl.col('ambiguous') == True)
+                          .collect())
     assert set(flip['ID'].to_list()).issubset({'2:2:T:A'})
     assert set(flip['match_type'].to_list()).issubset({'no_oa_alt', 'no_oa_ref_flip'})


 def test_flip_match(small_flipped_scorefile, small_target):
     scorefile, target = _cast_cat(small_flipped_scorefile, small_target)
-
-    df = get_all_matches(scorefile, target, skip_flip=True, remove_ambiguous=False, keep_first_match=False).collect()
-    assert set(df['ambiguous']) == {True}
-    assert set(df['match_type']) == {'refalt'}
-
-    flip = (get_all_matches(scorefile, target, skip_flip=False, remove_ambiguous=False, keep_first_match=False)
-            .filter(pl.col('ambiguous') == False)).collect()
+    params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False,
+              'remove_multiallelic': False}
+    df: pl.DataFrame = (get_all_matches(scorefile, target)
+                        .pipe(label_matches, params=params)
+                        .collect())
+
+    assert df['ambiguous'].to_list() == [True, False, True, False]
+    assert df['match_type'].to_list() == ['refalt', 'refalt_flip', 'altref_flip', 'altref_flip']
+    assert df['match_status'].to_list() == ['matched', 'excluded', 'not_best', 'excluded']  # flipped -> excluded
+
+    no_flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False,
+                      'remove_multiallelic': False}
+    flip: pl.DataFrame = (get_all_matches(scorefile, target)
+                          .pipe(label_matches, params=no_flip_params)
+                          .filter(pl.col('ambiguous') == False)
+                          .collect())

     assert flip['match_type'].str.contains('flip').all()
     assert set(flip['ID'].to_list()).issubset(['3:3:T:G', '1:1:A:C'])

From d5cfcf0fae63e34dd2fa799ab678fb5c2535be6b Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Mon, 10 Oct 2022 16:34:08 +0100
Subject: [PATCH 43/46] add sort by match type

---
 pgscatalog_utils/match/log.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py
index 6143308..978049c 100644
--- a/pgscatalog_utils/match/log.py
+++ b/pgscatalog_utils/match/log.py
@@ -51,7 +51,7 @@ def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
                  "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"]
     pretty_df = (df.select(keep_cols)
                  .select(pl.exclude("^.*_right"))
-                 .sort(["accession", "row_nr", "chr_name", "chr_position"]))
+                 .sort(["accession", "row_nr", "chr_name", "chr_position", "match_type"]))
     return pretty_df

From 126f153ff2b9fa14e4248482460988721d0a7537 Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Mon, 10 Oct 2022 16:34:17 +0100
Subject: [PATCH 44/46] fix _match_multiple_targets

---
 pgscatalog_utils/match/match_variants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py
index 1fc322d..380e71c 100644
--- a/pgscatalog_utils/match/match_variants.py
+++ b/pgscatalog_utils/match/match_variants.py
@@ -110,7 +110,7 @@ def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace,
         _check_target_chroms(target)
         matches.append(get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory))
     return (pl.concat(matches)
-            .pipe(label_params=params))
+            .pipe(label_matches, params=params))


 def _description_text() -> str:

From 6aee17056bc93d0460ecc2a129e5223ca474a24e Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield
Date: Mon, 10 Oct 2022 16:38:06 +0100
Subject: [PATCH 45/46] oops

---
 pgscatalog_utils/match/log.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py
index 978049c..5b74517 100644
--- a/pgscatalog_utils/match/log.py
+++ b/pgscatalog_utils/match/log.py
@@ -51,7 +51,7 @@ def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
                  "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"]
     pretty_df = (df.select(keep_cols)
                  .select(pl.exclude("^.*_right"))
-                 .sort(["accession", "row_nr", "chr_name", "chr_position", "match_type"]))
+                 .sort(["accession", "row_nr", "chr_name", "chr_position", "match_status"]))
     return pretty_df
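The patches above settle on one composition pattern: get_all_matches() now returns unlabelled match candidates, and label_matches() is applied afterwards through polars' .pipe(), which calls the supplied function with the frame as its first argument and forwards any keyword arguments. That is also why the earlier .pipe(label_params=params) call needed the fix in PATCH 44: .pipe() expects the callable first. A small self-contained sketch of the pattern, with a toy frame and a toy labelling function standing in for the package's own:

    import polars as pl

    def toy_label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
        # Toy labelling step (not the real label_matches): flag ambiguous rows for exclusion.
        return df.with_columns([
            (pl.col("ambiguous") & pl.lit(params["remove_ambiguous"])).alias("exclude")
        ])

    params = {"skip_flip": True, "remove_ambiguous": True,
              "remove_multiallelic": False, "keep_first_match": False}

    matches = pl.DataFrame({"ID": ["2:2:T:A", "1:1:A:C"], "ambiguous": [True, False]}).lazy()

    # .pipe(fn, **kwargs) is equivalent to fn(matches, **kwargs): the callable comes first,
    # which is why a call passing only keyword arguments could not work.
    labelled = matches.pipe(toy_label_matches, params=params).collect()
    print(labelled)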
From 4a213936e4c9c4ed91a54e2b19584f1d9f0f967b Mon Sep 17 00:00:00 2001
From: Sam Lambert
Date: Tue, 11 Oct 2022 11:34:31 +0100
Subject: [PATCH 46/46] Update pyproject.toml

Add Laurent

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ef69c8f..18de317 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@
 name = "pgscatalog_utils"
 version = "0.2.0"
 description = "Utilities for working with PGS Catalog API and scoring files"
 homepage = "https://github.com/PGScatalog/pgscatalog_utils"
-authors = ["Benjamin Wingfield ", "Samuel Lambert "]
+authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "]
 license = "Apache-2.0"
 readme = "README.md"