From 026c07ffeaf663e39f29db7b456fe83619f6b67d Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 18 Oct 2022 15:39:09 +0100 Subject: [PATCH 01/54] split matches by chromosome and aggregate them --- .../aggregate/aggregate_matches.py | 68 +++++++++++++++++++ pgscatalog_utils/match/label.py | 14 +++- pgscatalog_utils/match/log.py | 41 +++-------- pgscatalog_utils/match/match_variants.py | 22 ++---- pgscatalog_utils/match/read.py | 16 +++-- pgscatalog_utils/match/write.py | 26 +++++-- 6 files changed, 131 insertions(+), 56 deletions(-) create mode 100644 pgscatalog_utils/aggregate/aggregate_matches.py diff --git a/pgscatalog_utils/aggregate/aggregate_matches.py b/pgscatalog_utils/aggregate/aggregate_matches.py new file mode 100644 index 0000000..6d277ee --- /dev/null +++ b/pgscatalog_utils/aggregate/aggregate_matches.py @@ -0,0 +1,68 @@ +import argparse +import logging +import os + +import polars as pl + +from pgscatalog_utils import config + +from pgscatalog_utils.match.filter import filter_scores +from pgscatalog_utils.match.log import make_summary_log +from pgscatalog_utils.match.read import read_scorefile +from pgscatalog_utils.match.write import write_log, write_out + +logger = logging.getLogger(__name__) + + +def aggregate_matches(): + args = _parse_args() + config.set_logging_level(args.verbose) + + config.POLARS_MAX_THREADS = args.n_threads + os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) + # now the environment variable, parsed argument args.n_threads, and threadpool should agree + logger.debug(f"Setting POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") + logger.debug(f"Using {config.POLARS_MAX_THREADS} threads to read CSVs") + logger.debug(f"polars threadpool size: {pl.threadpool_size()}") + + with pl.StringCache(): + scorefile = read_scorefile(path=args.scorefile, chrom=None) # read all variants + + logs = pl.scan_ipc(args.logs) # lazily read ipc to preserve dtypes + valid_matches, filter_summary = 
filter_scores(scorefile=scorefile, matches=logs, dataset="test", + min_overlap=args.min_overlap) + + if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 + logger.error("Error: no target variants match any variants in scoring files") + raise Exception + + summary_log = make_summary_log(best_matches=valid_matches, filter_summary=filter_summary) + + dataset = args.dataset.replace('_', '-') + write_log(df=logs, prefix=dataset, chrom=None, outdir=args.outdir, file_format="csv") + summary_log.collect().write_csv(f"{dataset}_summary.csv") + write_out(valid_matches, args.split, args.outdir, dataset) + + +def _parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--dataset', dest='dataset', required=True, + help=' Label for target genomic dataset') + parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, + type=float, help=' Minimum proportion of variants to match before error') + parser.add_argument('-s', '--scorefile', dest='scorefile', required=True, + help=' Path to scorefile') + parser.add_argument('--split', dest='split', default=True, action='store_true', + help=' Split scorefile per chromosome?') + parser.add_argument('-l', '--logs', dest='logs', required=True, + help=' Glob of log files including quotation marks e.g. 
"*.ipc"') + parser.add_argument('--outdir', dest='outdir', required=True, + help=' Output directory') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help=' Extra logging information') + parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) + return parser.parse_args(args) + + +if __name__ == "__main__": + aggregate_matches() diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 1c55ba3..2a97234 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -25,7 +25,19 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: .pipe(_label_flips, params['skip_flip']) .with_column(pl.lit(True).alias('match_candidate'))) - return _encode_match_priority(labelled) + return _encode_match_priority(labelled).pipe(_reset_coltypes) + + +def _reset_coltypes(df: pl.LazyFrame): + """ Set up categorical columns after labelling is finished """ + return df.with_columns([ + pl.col('effect_allele').cast(pl.Categorical), + pl.col('effect_allele_FLIP').cast(pl.Categorical), + pl.col('other_allele_FLIP').cast(pl.Categorical), + pl.col('REF').cast(pl.Categorical), + pl.col('ALT').cast(pl.Categorical), + pl.col('match_type').cast(pl.Categorical) + ]) def _encode_match_priority(df: pl.LazyFrame) -> pl.LazyFrame: diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 5b74517..6718817 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -5,25 +5,12 @@ logger = logging.getLogger(__name__) -def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, filter_summary: pl.LazyFrame, dataset: str): +def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, dataset: str): # summary log -> aggregated from best matches (one per scoring file line) # big log -> unaggregated, written to compressed gzip, possibly multiple matches per scoring file line - summary_log, big_log = 
_join_match_candidates(scorefile=scorefile, matches=match_candidates, - filter_summary=filter_summary, - dataset=dataset) + big_log = _join_match_candidates(scorefile=scorefile, matches=match_candidates, dataset=dataset) - # make sure the aggregated best log matches the scoring file accession line count - summary_count: pl.LazyFrame = (summary_log.groupby(pl.col('accession')) - .agg(pl.sum('count'))) - log_count: pl.DataFrame = (scorefile.groupby("accession") - .agg(pl.count()) - .join(summary_count, on='accession')).collect() - - assert (log_count.get_column('count') == log_count.get_column( - 'count_right')).all(), "Log doesn't match input scoring file" - logger.debug("Log matches input scoring file") - - return _prettify_log(big_log), _prettify_summary(summary_log) + return _prettify_log(big_log) def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) -> pl.LazyFrame: @@ -48,32 +35,22 @@ def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame: def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame: keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight", "effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic", - "ambiguous", "duplicate_best_match", "duplicate_ID", "match_status", "dataset"] + "ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID", + "match_status", "dataset"] pretty_df = (df.select(keep_cols) .select(pl.exclude("^.*_right")) .sort(["accession", "row_nr", "chr_name", "chr_position", "match_status"])) return pretty_df -def _join_match_candidates(scorefile: pl.LazyFrame, matches: pl.LazyFrame, filter_summary: pl.LazyFrame, - dataset: str) -> tuple[pl.LazyFrame, pl.LazyFrame]: +def _join_match_candidates(scorefile: pl.LazyFrame, matches: pl.LazyFrame, dataset: str) -> pl.LazyFrame: """ Join match candidates against the original scoring file """ - logger.debug("Making big logs") - - # make the summary 
log using the best matched candidates only - summary_log = (scorefile.join(matches.filter(pl.col('best_match') == True), - on=['row_nr', 'accession'], - how='outer') # left join would make checking line count later pointless - .with_column(pl.lit(dataset).alias('dataset')) - .select(pl.exclude("^.*_right$")) - .with_column(pl.col('match_status').fill_null("unmatched")) - .pipe(make_summary_log, filter_summary)) - + logger.debug("Joining all match candidates against input scoring file") # make a raw log with all match candidates included raw_log = (scorefile.join(matches, on=['row_nr', 'accession'], how='outer') - .with_column(pl.lit(dataset).alias('dataset')) + .with_columns(pl.lit(dataset).alias('dataset').cast(pl.Categorical)) .select(pl.exclude("^.*_right$"))).with_column(pl.col('match_status').fill_null("unmatched")) - return summary_log, raw_log + return raw_log diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 380e71c..0abcad4 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -7,12 +7,11 @@ import polars as pl import pgscatalog_utils.config as config -from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.label import label_matches from pgscatalog_utils.match.log import make_logs from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.read import read_target, read_scorefile -from pgscatalog_utils.match.write import write_out, write_log +from pgscatalog_utils.match.write import write_log logger = logging.getLogger(__name__) @@ -29,7 +28,7 @@ def match_variants(): logger.debug(f"polars threadpool size: {pl.threadpool_size()}") with pl.StringCache(): - scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile) + scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile, chrom=args.chrom) target_paths = list(set(args.target)) n_target_files = len(target_paths) matches: pl.DataFrame @@ -65,18 
+64,11 @@ def match_variants(): raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, - min_overlap=args.min_overlap) + big_log = make_logs(scorefile=scorefile, match_candidates=matches, dataset=dataset) - if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 - logger.error("Error: no target variants match any variants in scoring files") - raise Exception + write_log(df=big_log, prefix=dataset, chrom=args.chrom, outdir=args.outdir, file_format="ipc") - big_log, summary_log = make_logs(scorefile, matches, filter_summary, args.dataset) - - write_log(big_log, prefix=dataset) - summary_log.collect().write_csv(f"{dataset}_summary.csv") - write_out(valid_matches, args.split, args.outdir, dataset) + # write_out(valid_matches, args.split, args.outdir, dataset) def _check_target_chroms(target: pl.LazyFrame) -> None: @@ -169,6 +161,8 @@ def _parse_args(args=None): help=' Combined scorefile path (output of read_scorefiles.py)') parser.add_argument('-t', '--target', dest='target', required=True, nargs='+', help=' A list of paths of target genomic variants (.bim format)') + parser.add_argument('-c', '--chrom', dest='chrom', required=False, type=str, + help=' Set which chromosome is in the target variant file to speed up matching ') parser.add_argument('-f', '--fast', dest='fast', action='store_true', help=' Enable faster matching at the cost of increased RAM usage') parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) @@ -176,8 +170,6 @@ def _parse_args(args=None): help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') - parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, - type=float, help=' Minimum proportion of variants to match before error') 
parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', help=''' Flag to force the program to keep variants with ambiguous alleles, (e.g. A/T and G/C SNPs), which are normally diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index e7417f1..2f4fcc1 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -1,5 +1,6 @@ import logging +import typing import polars as pl import pgscatalog_utils.config as config @@ -20,7 +21,7 @@ def read_target(paths: list[str], low_memory: bool) -> pl.LazyFrame: .with_column(pl.col('ALT').cast(pl.Categorical))).lazy() -def read_scorefile(path: str) -> pl.LazyFrame: +def read_scorefile(path: str, chrom: typing.Union[str, None]) -> pl.LazyFrame: logger.debug("Reading scorefile") dtypes = {'chr_name': pl.Categorical, 'chr_position': pl.UInt64, @@ -28,10 +29,17 @@ def read_scorefile(path: str) -> pl.LazyFrame: 'other_allele': pl.Utf8, 'effect_type': pl.Categorical, 'accession': pl.Categorical} - return (pl.read_csv(path, sep='\t', dtype=dtypes, n_threads=config.POLARS_MAX_THREADS) - .lazy() - .pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])).with_columns([ + ldf = pl.read_csv(path, sep = '\t', dtype=dtypes).lazy() + if chrom is not None: + logger.debug(f"--chrom set, filtering scoring file to chromosome {chrom}") + ldf = ldf.filter(pl.col('chr_name') == chrom) # add filter to query plan + else: + logger.debug("--chrom parameter not set, using all variants in scoring file") + + return (ldf.pipe(complement_valid_alleles, flip_cols=['effect_allele', 'other_allele'])).with_columns([ pl.col("effect_allele").cast(pl.Categorical), pl.col("other_allele").cast(pl.Categorical), pl.col("effect_allele_FLIP").cast(pl.Categorical), pl.col("other_allele_FLIP").cast(pl.Categorical)]) + + diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 9d4ba92..b14d60c 100644 --- a/pgscatalog_utils/match/write.py +++ 
b/pgscatalog_utils/match/write.py @@ -1,16 +1,34 @@ import gzip import logging import os +import typing import polars as pl logger = logging.getLogger(__name__) -def write_log(df: pl.LazyFrame, prefix: str) -> None: - logger.debug(f"Compressing and writing log: {prefix}_log.csv.gz") - with gzip.open(f"{prefix}_log.csv.gz", 'wb') as f: - df.collect().write_csv(f) +def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], file_format: str, outdir: str) -> None: + # feather file preserves dtypes and is small + # don't compress the feather file to allow memory mapping + if chrom is None: + log_name: str = os.path.join(os.path.abspath(outdir), f"{prefix}_log") + else: + log_name: str = os.path.join(os.path.abspath(outdir), f"{prefix}_chrom{chrom}_log") + + match file_format: + case 'ipc': + fout: str = ''.join([log_name, ".ipc.zst"]) + logger.debug(f"Writing {fout} in format: {file_format}") + df.collect().write_ipc(fout, compression='zstd') # gzip compression not supported + case 'csv': + fout: str = ''.join([log_name, ".csv.gz"]) + logger.debug(f"Writing {fout} in format: {file_format}") + with gzip.open(fout, 'wb') as f: + df.collect().write_csv(f) + case _: + logger.critical(f"Invalid format: {file_format}") + raise Exception def write_out(df: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: From ed7d700e833f46aba7907ed12becba604b2d606a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 18 Oct 2022 15:57:16 +0100 Subject: [PATCH 02/54] bump version --- Dockerfile | 4 ++-- pgscatalog_utils/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0d42228..0549d2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ FROM python:3.10 WORKDIR /opt/ -COPY --from=builder /app/dist/pgscatalog_utils-0.2.0-py3-none-any.whl . +COPY --from=builder /app/dist/pgscatalog_utils-0.3.0-py3-none-any.whl . 
-RUN pip install pgscatalog_utils-0.2.0-py3-none-any.whl +RUN pip install pgscatalog_utils-0.3.0-py3-none-any.whl RUN apt-get update && apt-get install -y sqlite3 \ No newline at end of file diff --git a/pgscatalog_utils/__init__.py b/pgscatalog_utils/__init__.py index 7fd229a..0404d81 100644 --- a/pgscatalog_utils/__init__.py +++ b/pgscatalog_utils/__init__.py @@ -1 +1 @@ -__version__ = '0.2.0' +__version__ = '0.3.0' diff --git a/pyproject.toml b/pyproject.toml index 18de317..aad3498 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.2.0" +version = "0.3.0" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel Lambert ", "Laurent Gil "] From dc748ff69f5a744c9d0d400720086e54331eb51e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 18 Oct 2022 17:45:33 +0100 Subject: [PATCH 03/54] fix summary log --- pgscatalog_utils/aggregate/aggregate_matches.py | 4 ++-- pgscatalog_utils/match/log.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/aggregate/aggregate_matches.py b/pgscatalog_utils/aggregate/aggregate_matches.py index 6d277ee..3636342 100644 --- a/pgscatalog_utils/aggregate/aggregate_matches.py +++ b/pgscatalog_utils/aggregate/aggregate_matches.py @@ -36,9 +36,9 @@ def aggregate_matches(): logger.error("Error: no target variants match any variants in scoring files") raise Exception - summary_log = make_summary_log(best_matches=valid_matches, filter_summary=filter_summary) - dataset = args.dataset.replace('_', '-') + summary_log = make_summary_log(best_matches=valid_matches, filter_summary=filter_summary, dataset=dataset, + scorefile=scorefile) write_log(df=logs, prefix=dataset, chrom=None, outdir=args.outdir, file_format="csv") summary_log.collect().write_csv(f"{dataset}_summary.csv") write_out(valid_matches, 
args.split, args.outdir, dataset) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index 6718817..ae095d5 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -13,14 +13,19 @@ def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, dataset: return _prettify_log(big_log) -def make_summary_log(best_matches: pl.LazyFrame, filter_summary: pl.LazyFrame) -> pl.LazyFrame: +def make_summary_log(best_matches: pl.LazyFrame, scorefile: pl.LazyFrame, filter_summary: pl.LazyFrame, + dataset: str) -> pl.LazyFrame: """ Make an aggregated table """ - logger.debug("Aggregating best match log into a summary table") - return (best_matches + logger.debug("Aggregating best matches into a summary table") + return (scorefile.join(best_matches, on=['row_nr', 'accession'], how='outer') + .select(pl.exclude("^.*_right$")) + .with_columns([pl.col('match_status').fill_null(value='unmatched'), + pl.lit(dataset).alias('dataset')]) # fill in unmatched variants .groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped', 'duplicate_best_match', 'duplicate_ID']) .agg(pl.count()) - .join(filter_summary, how='left', on='accession')) + .join(filter_summary, how='left', on='accession') + .pipe(_prettify_summary)) def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame: From a0b607218f7f8a39c2bf6ce6eb90cc6d4f9ce15b Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 18 Oct 2022 17:45:40 +0100 Subject: [PATCH 04/54] fix types --- pgscatalog_utils/match/filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index c2d0364..695b5c2 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -26,13 +26,13 @@ def filter_scores(scorefile: pl.LazyFrame, matches: pl.LazyFrame, min_overlap: f scores.append(df.with_column(pl.col('accession').cast(pl.Categorical))) score_summary: 
pl.LazyFrame = pl.concat(scores).lazy() - filtered_scores: pl.DataFrame = (filtered_matches.join(score_summary, on='accession', how='left') + filtered_scores: pl.LazyFrame = (filtered_matches.join(score_summary, on='accession', how='left') .filter(pl.col('score_pass') == True)) return filtered_scores, score_summary -def _calculate_match_rate(df: pl.DataFrame) -> pl.DataFrame: +def _calculate_match_rate(df: pl.LazyFrame) -> pl.LazyFrame: logger.debug("Calculating overlap between target genome and scoring file") return (df.groupby('accession') .agg([pl.count(), (pl.col('match_type') == None).sum().alias('no_match')]) From 66b1b8a6577ed759b232c7056f5e0a83fb2a9e29 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 18 Oct 2022 17:46:00 +0100 Subject: [PATCH 05/54] add aggregate_matches --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index aad3498..5f52eac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ readme = "README.md" combine_scorefiles = "pgscatalog_utils.scorefile.combine_scorefiles:combine_scorefiles" download_scorefiles = "pgscatalog_utils.download.download_scorefile:download_scorefile" match_variants = "pgscatalog_utils.match.match_variants:match_variants" +aggregate_matches = "pgscatalog_utils.aggregate.aggregate_matches:aggregate_matches" aggregate_scores = "pgscatalog_utils.aggregate.aggregate_scores:aggregate_scores" validate_scorefiles = "pgscatalog_utils.validate.validate_scorefile:validate_scorefile" From 67c13c04e346cc1cd7e67fffb58c04d4e7bd2324 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 19 Oct 2022 11:37:41 +0100 Subject: [PATCH 06/54] fix summary log aggregation --- pgscatalog_utils/aggregate/aggregate_matches.py | 8 +++++--- pgscatalog_utils/match/log.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/aggregate/aggregate_matches.py b/pgscatalog_utils/aggregate/aggregate_matches.py index 
3636342..2e6bb28 100644 --- a/pgscatalog_utils/aggregate/aggregate_matches.py +++ b/pgscatalog_utils/aggregate/aggregate_matches.py @@ -7,7 +7,7 @@ from pgscatalog_utils import config from pgscatalog_utils.match.filter import filter_scores -from pgscatalog_utils.match.log import make_summary_log +from pgscatalog_utils.match.log import make_summary_log, check_log_count from pgscatalog_utils.match.read import read_scorefile from pgscatalog_utils.match.write import write_log, write_out @@ -37,8 +37,10 @@ def aggregate_matches(): raise Exception dataset = args.dataset.replace('_', '-') - summary_log = make_summary_log(best_matches=valid_matches, filter_summary=filter_summary, dataset=dataset, - scorefile=scorefile) + summary_log: pl.LazyFrame = make_summary_log(match_candidates=logs, filter_summary=filter_summary, dataset=dataset, + scorefile=scorefile) + check_log_count(summary_log=summary_log, scorefile=scorefile) + write_log(df=logs, prefix=dataset, chrom=None, outdir=args.outdir, file_format="csv") summary_log.collect().write_csv(f"{dataset}_summary.csv") write_out(valid_matches, args.split, args.outdir, dataset) diff --git a/pgscatalog_utils/match/log.py b/pgscatalog_utils/match/log.py index ae095d5..a456d07 100644 --- a/pgscatalog_utils/match/log.py +++ b/pgscatalog_utils/match/log.py @@ -13,10 +13,11 @@ def make_logs(scorefile: pl.LazyFrame, match_candidates: pl.LazyFrame, dataset: return _prettify_log(big_log) -def make_summary_log(best_matches: pl.LazyFrame, scorefile: pl.LazyFrame, filter_summary: pl.LazyFrame, +def make_summary_log(match_candidates: pl.LazyFrame, scorefile: pl.LazyFrame, filter_summary: pl.LazyFrame, dataset: str) -> pl.LazyFrame: """ Make an aggregated table """ logger.debug("Aggregating best matches into a summary table") + best_matches: pl.LazyFrame = match_candidates.filter(pl.col('best_match') == True) return (scorefile.join(best_matches, on=['row_nr', 'accession'], how='outer') .select(pl.exclude("^.*_right$")) 
.with_columns([pl.col('match_status').fill_null(value='unmatched'), @@ -28,6 +29,20 @@ def make_summary_log(best_matches: pl.LazyFrame, scorefile: pl.LazyFrame, filter .pipe(_prettify_summary)) +def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> None: + """ Check aggregated counts vs original from scoring file """ + summary_count: pl.DataFrame = (summary_log.groupby(pl.col('accession')) + .agg(pl.sum('count'))).collect() + log_count: pl.DataFrame = (scorefile.groupby("accession") + .agg(pl.count()) + .collect() + .join(summary_count, on='accession')) + + assert (log_count.get_column('count') == log_count.get_column( + 'count_right')).all(), "Log doesn't match input scoring file" + logger.debug("Log matches input scoring file") + + def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame: keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic", "duplicate_best_match", "duplicate_ID", "count", "percent"] From b22a0081801f018ec47d408325f7daae02226f3a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 19 Oct 2022 14:05:14 +0100 Subject: [PATCH 07/54] fix tests and rename -> combine_matches --- conftest.py | 2 ++ .../combine_matches.py} | 4 +-- pgscatalog_utils/match/label.py | 14 +-------- pgscatalog_utils/match/read.py | 1 - pyproject.toml | 2 +- tests/match/test_label.py | 7 +++-- tests/match/test_match.py | 30 ++++--------------- 7 files changed, 16 insertions(+), 44 deletions(-) rename pgscatalog_utils/{aggregate/aggregate_matches.py => match/combine_matches.py} (98%) diff --git a/conftest.py b/conftest.py index 46631c7..0f087d1 100644 --- a/conftest.py +++ b/conftest.py @@ -12,6 +12,8 @@ from pgscatalog_utils.match.preprocess import complement_valid_alleles from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles +pl.toggle_string_cache(True) + @pytest.fixture(scope="session") def pgs_accessions(): diff --git a/pgscatalog_utils/aggregate/aggregate_matches.py 
b/pgscatalog_utils/match/combine_matches.py similarity index 98% rename from pgscatalog_utils/aggregate/aggregate_matches.py rename to pgscatalog_utils/match/combine_matches.py index 2e6bb28..7c57e27 100644 --- a/pgscatalog_utils/aggregate/aggregate_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -def aggregate_matches(): +def combine_matches(): args = _parse_args() config.set_logging_level(args.verbose) @@ -67,4 +67,4 @@ def _parse_args(args=None): if __name__ == "__main__": - aggregate_matches() + combine_matches() diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 2a97234..1c55ba3 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -25,19 +25,7 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: .pipe(_label_flips, params['skip_flip']) .with_column(pl.lit(True).alias('match_candidate'))) - return _encode_match_priority(labelled).pipe(_reset_coltypes) - - -def _reset_coltypes(df: pl.LazyFrame): - """ Set up categorical columns after labelling is finished """ - return df.with_columns([ - pl.col('effect_allele').cast(pl.Categorical), - pl.col('effect_allele_FLIP').cast(pl.Categorical), - pl.col('other_allele_FLIP').cast(pl.Categorical), - pl.col('REF').cast(pl.Categorical), - pl.col('ALT').cast(pl.Categorical), - pl.col('match_type').cast(pl.Categorical) - ]) + return _encode_match_priority(labelled) def _encode_match_priority(df: pl.LazyFrame) -> pl.LazyFrame: diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index 2f4fcc1..b98ff2e 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -3,7 +3,6 @@ import typing import polars as pl -import pgscatalog_utils.config as config from pgscatalog_utils.match.preprocess import annotate_multiallelic, complement_valid_alleles, filter_target from pgscatalog_utils.target import Target diff --git a/pyproject.toml 
b/pyproject.toml index 5f52eac..a1e2d19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ readme = "README.md" combine_scorefiles = "pgscatalog_utils.scorefile.combine_scorefiles:combine_scorefiles" download_scorefiles = "pgscatalog_utils.download.download_scorefile:download_scorefile" match_variants = "pgscatalog_utils.match.match_variants:match_variants" -aggregate_matches = "pgscatalog_utils.aggregate.aggregate_matches:aggregate_matches" +combine_matches = "pgscatalog_utils.match.combine_matches:combine_matches" aggregate_scores = "pgscatalog_utils.aggregate.aggregate_scores:aggregate_scores" validate_scorefiles = "pgscatalog_utils.validate.validate_scorefile:validate_scorefile" diff --git a/tests/match/test_label.py b/tests/match/test_label.py index ebe0c43..e3e531f 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -125,7 +125,7 @@ def duplicated_matches(small_scorefile, small_target, request) -> pl.DataFrame: params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': request.param} - return (get_all_matches(scorefile=scorefile, target=target) + return (get_all_matches(scorefile=scorefile, target=target, low_memory=False) .pipe(label_matches, params=params) .collect()) @@ -136,7 +136,7 @@ def multiple_match_types(small_target, small_scorefile) -> pl.DataFrame: scorefile, target = _cast_cat(small_scorefile, small_target) params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False} - return (get_all_matches(scorefile=scorefile, target=target) + return (get_all_matches(scorefile=scorefile, target=target, low_memory=False) .pipe(label_matches, params=params) .filter(pl.col('chr_name') == '2') .collect()) @@ -147,9 +147,10 @@ def duplicate_best_match(small_target, small_scorefile_no_oa) -> pl.DataFrame: # this type of target genome can sometimes occur when the REF is different at the same position odd_target = {'#CHROM': [1, 
1], 'POS': [1, 1], 'REF': ['T', 'C'], 'ALT': ['A', 'A'], 'ID': ['1:1:T:C', '1:1:A:A'], 'is_multiallelic': [False, False]} + scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target)) params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False} - return (get_all_matches(scorefile=scorefile, target=target) + return (get_all_matches(scorefile=scorefile, target=target, low_memory=False) .pipe(label_matches, params=params) .collect()) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index ca509d6..5252784 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -3,34 +3,16 @@ from unittest.mock import patch import polars as pl -import pytest - from pgscatalog_utils.match.label import label_matches from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.match_variants import match_variants -def test_match_fail(combined_scorefile, target_path, tmp_path): - out_dir = str(tmp_path.resolve()) - - args: list[str] = ['match_variants', '-s', combined_scorefile, - '-t', target_path, - '-m', 1, - '-d', 'test', - '--outdir', out_dir, - '--keep_ambiguous', '--keep_multiallelic'] - - with pytest.raises(Exception): - with patch('sys.argv', args): - match_variants() - - def test_match_pass(mini_scorefile, target_path, tmp_path): out_dir = str(tmp_path.resolve()) args: list[str] = ['match_variants', '-s', mini_scorefile, '-t', target_path, - '-m', 0, '-d', 'test', '--outdir', out_dir, '--keep_ambiguous', '--keep_multiallelic'] @@ -63,7 +45,7 @@ def test_match_strategies(small_scorefile, small_target): params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} # check unambiguous matches - df: pl.DataFrame = (get_all_matches(scorefile, target) + df: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) .pipe(label_matches, params=params) .filter(pl.col('ambiguous') == False) 
.collect()) @@ -72,7 +54,7 @@ def test_match_strategies(small_scorefile, small_target): # when keeping ambiguous and flipping alleles flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - flip: pl.DataFrame = (get_all_matches(scorefile, target) + flip: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) .pipe(label_matches, params=flip_params) .filter(pl.col('ambiguous') == True) .collect()) @@ -85,7 +67,7 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) no_ambig = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - df: pl.DataFrame = (get_all_matches(scorefile, target) + df: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) .pipe(label_matches, params=no_ambig) .filter(pl.col('ambiguous') == False) .collect()) @@ -95,7 +77,7 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): # check ambiguous matches ambig = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - flip: pl.DataFrame = (get_all_matches(scorefile, target) + flip: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) .pipe(label_matches, ambig) .filter(pl.col('ambiguous') == True) .collect()) @@ -106,7 +88,7 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - df: pl.DataFrame = (get_all_matches(scorefile, target) + df: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) .pipe(label_matches, params=params) .collect()) @@ -116,7 +98,7 @@ def test_flip_match(small_flipped_scorefile, small_target): no_flip_params = {'skip_flip': 
False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - flip: pl.DataFrame = (get_all_matches(scorefile, target) + flip: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) .pipe(label_matches, params=no_flip_params) .filter(pl.col('ambiguous') == False) .collect()) From c7011c7ef66ad0716cf6731c4e685d8d0e195080 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 20 Oct 2022 16:16:19 +0100 Subject: [PATCH 08/54] fix invalid pgs id --- pgscatalog_utils/download/score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/download/score.py b/pgscatalog_utils/download/score.py index 3c2bf29..e5eefb7 100644 --- a/pgscatalog_utils/download/score.py +++ b/pgscatalog_utils/download/score.py @@ -18,7 +18,7 @@ def get_url(pgs: list[str], build: str, user_agent:str = None) -> dict[str, str] response = _parse_json_query(query_score(chunk,user_agent), build) pgs_result = pgs_result + list(response.keys()) url_result = url_result + list(response.values()) - except TypeError: + except (AttributeError, TypeError): logger.error(f"Bad response from PGS Catalog API. 
Is {pgs} a valid ID?") sys.exit(1) From eafd0fb54d6c15cdc26a4fed30bb9408ee310987 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 20 Oct 2022 16:16:42 +0100 Subject: [PATCH 09/54] keep match_variants functionality (optional) --- pgscatalog_utils/match/combine_matches.py | 33 +++-------- pgscatalog_utils/match/match_variants.py | 70 +++++++++++++++++++++-- 2 files changed, 72 insertions(+), 31 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 7c57e27..7981c9f 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -5,11 +5,8 @@ import polars as pl from pgscatalog_utils import config - -from pgscatalog_utils.match.filter import filter_scores -from pgscatalog_utils.match.log import make_summary_log, check_log_count +from pgscatalog_utils.match.match_variants import log_and_write from pgscatalog_utils.match.read import read_scorefile -from pgscatalog_utils.match.write import write_log, write_out logger = logging.getLogger(__name__) @@ -26,38 +23,24 @@ def combine_matches(): logger.debug(f"polars threadpool size: {pl.threadpool_size()}") with pl.StringCache(): - scorefile = read_scorefile(path=args.scorefile, chrom=None) # read all variants - - logs = pl.scan_ipc(args.logs) # lazily read ipc to preserve dtypes - valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=logs, dataset="test", - min_overlap=args.min_overlap) - - if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 - logger.error("Error: no target variants match any variants in scoring files") - raise Exception - - dataset = args.dataset.replace('_', '-') - summary_log: pl.LazyFrame = make_summary_log(match_candidates=logs, filter_summary=filter_summary, dataset=dataset, - scorefile=scorefile) - check_log_count(summary_log=summary_log, scorefile=scorefile) - - write_log(df=logs, prefix=dataset, chrom=None, outdir=args.outdir, 
file_format="csv") - summary_log.collect().write_csv(f"{dataset}_summary.csv") - write_out(valid_matches, args.split, args.outdir, dataset) + scorefile = read_scorefile(path=args.scorefile, chrom=None) # chrom=None to read all variants + matches = pl.scan_ipc(args.matches) # lazily read ipc to preserve dtypes + dataset = args.dataset.replace('_', '-') # _ used as delimiter in pgsc_calc + log_and_write(matches=matches, scorefile=scorefile, dataset=dataset, args=args) def _parse_args(args=None): parser = argparse.ArgumentParser() parser.add_argument('-d', '--dataset', dest='dataset', required=True, help=' Label for target genomic dataset') - parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True, + parser.add_argument('--min_overlap', dest='min_overlap', required=True, type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('-s', '--scorefile', dest='scorefile', required=True, help=' Path to scorefile') parser.add_argument('--split', dest='split', default=True, action='store_true', help=' Split scorefile per chromosome?') - parser.add_argument('-l', '--logs', dest='logs', required=True, - help=' Glob of log files including quotation marks e.g. "*.ipc"') + parser.add_argument('-m', '--matches', dest='matches', required=True, + help=' Glob of match files including quotation marks e.g. 
"*.ipc"') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0abcad4..bce8a86 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -7,11 +7,12 @@ import polars as pl import pgscatalog_utils.config as config +from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.label import label_matches -from pgscatalog_utils.match.log import make_logs +from pgscatalog_utils.match.log import make_logs, make_summary_log, check_log_count from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.read import read_target, read_scorefile -from pgscatalog_utils.match.write import write_log +from pgscatalog_utils.match.write import write_log, write_out logger = logging.getLogger(__name__) @@ -64,11 +65,37 @@ def match_variants(): raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator - big_log = make_logs(scorefile=scorefile, match_candidates=matches, dataset=dataset) - write_log(df=big_log, prefix=dataset, chrom=args.chrom, outdir=args.outdir, file_format="ipc") + if args.only_match: + fout: str = f"{dataset}_{args.chrom}_matches.ipc.zst" + logger.debug(f"--only_match set, writing out match candidates {fout} and exiting") + matches.collect().write_ipc(fout, compression="zstd") + logger.debug("Intermediate files can be processed with combine_matches") + sys.exit(0) + else: + logger.debug("Picking best match candidates and making scoring files") + log_and_write(matches=matches, scorefile=scorefile, dataset=dataset, args=args) - # write_out(valid_matches, args.split, args.outdir, dataset) + +def log_and_write(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str, args): + """ Make match logs and write """ + 
valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, + min_overlap=args.min_overlap) + + if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 + logger.critical("Error: no target variants match any variants in scoring files") + raise Exception("No valid matches found") + + big_log: pl.LazyFrame = make_logs(scorefile=scorefile, match_candidates=matches, dataset=dataset) + summary_log: pl.LazyFrame = make_summary_log(match_candidates=matches, filter_summary=filter_summary, + dataset=dataset, + scorefile=scorefile) + + check_log_count(summary_log=summary_log, scorefile=scorefile) + + write_log(df=big_log, prefix=dataset, chrom=None, outdir=args.outdir, file_format="csv") + summary_log.collect().write_csv(f"{dataset}_summary.csv") + write_out(valid_matches, args.split, args.outdir, dataset) def _check_target_chroms(target: pl.LazyFrame) -> None: @@ -165,6 +192,10 @@ def _parse_args(args=None): help=' Set which chromosome is in the target variant file to speed up matching ') parser.add_argument('-f', '--fast', dest='fast', action='store_true', help=' Enable faster matching at the cost of increased RAM usage') + parser.add_argument('--only_match', dest='only_match', action='store_true', + help=" Only match, then write intermediate files, don't make scoring files") + parser.add_argument('--min_overlap', dest='min_overlap', required=False, + type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) parser.add_argument('--split', dest='split', default=False, action='store_true', help=' Split scorefile per chromosome?') @@ -188,7 +219,34 @@ def _parse_args(args=None): keep the first match candidate (default: drop all candidates)''') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') - return parser.parse_args(args) + return 
_check_args(parser.parse_args(args)) + + +def _check_args(args): + if args.chrom is not None and not args.only_match: + # filtering the scoring file will break overlap assumptions and calculations + # e.g.: + # what if one chromosome matches well but another chromosome matches poorly? + # what if the poorly matching chromosome only has 5 variants to match? + # + # pgsc_calc uses global overlap % to decide if a score fails matching + # --only_match skips overlap calculations (done in combine_matches instead) + logger.critical("--chrom requires --only_match") + sys.exit(1) + if args.only_match and args.min_overlap is not None: + # can't calculate min_overlap properly if just checking matches + logger.critical("Invalid arguments: --only_match and --min_overlap (pick one!)") + sys.exit(1) + if not args.only_match and args.min_overlap is None: + # need to calculate min_overlap before making scoring files + logger.critical("Invalid arguments: set --min_overlap or --only_match") + sys.exit(1) + if args.split and args.only_match: + # not writing scoring files, so split output doesn't make sense + logger.critical("Invalid arguments: --only_match and --split (pick one!)") + sys.exit(1) + + return args def _make_params_dict(args) -> dict[str, bool]: From 4a608979185a1d3e81be2d0b75ffe1c361d39e85 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 20 Oct 2022 16:54:46 +0100 Subject: [PATCH 10/54] fix tests --- tests/match/test_match.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 5252784..f96ffd2 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -3,6 +3,8 @@ from unittest.mock import patch import polars as pl +import pytest + from pgscatalog_utils.match.label import label_matches from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.match_variants import match_variants @@ -14,6 +16,7 @@ def test_match_pass(mini_scorefile, 
target_path, tmp_path): args: list[str] = ['match_variants', '-s', mini_scorefile, '-t', target_path, '-d', 'test', + '--min_overlap', 0, '--outdir', out_dir, '--keep_ambiguous', '--keep_multiallelic'] @@ -21,6 +24,21 @@ def test_match_pass(mini_scorefile, target_path, tmp_path): match_variants() +def test_match_fail(mini_scorefile, target_path, tmp_path): + out_dir = str(tmp_path.resolve()) + + args: list[str] = ['match_variants', '-s', mini_scorefile, + '-t', target_path, + '-d', 'test', + '--min_overlap', 1, + '--outdir', out_dir, + '--keep_ambiguous', '--keep_multiallelic'] + + with pytest.raises(Exception): + with patch('sys.argv', args): + match_variants() + + def _cast_cat(scorefile, target) -> tuple[pl.LazyFrame, pl.LazyFrame]: with pl.StringCache(): scorefile = scorefile.with_columns([ From a1c1ffbbaa35b54ab22fe495c536cc54d8c6ecb6 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 21 Oct 2022 11:46:29 +0100 Subject: [PATCH 11/54] clarify log message --- pgscatalog_utils/match/match_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index bce8a86..0f8224c 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -73,7 +73,7 @@ def match_variants(): logger.debug("Intermediate files can be processed with combine_matches") sys.exit(0) else: - logger.debug("Picking best match candidates and making scoring files") + logger.debug("Filtering match candidates and making scoring files") log_and_write(matches=matches, scorefile=scorefile, dataset=dataset, args=args) From abb5fd958dab09456ff707432bf0b7f1869a0555 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 21 Oct 2022 11:46:42 +0100 Subject: [PATCH 12/54] check for duplicate IDs in matches before writing scoring files --- pgscatalog_utils/match/combine_matches.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff 
--git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 7981c9f..3950952 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -24,9 +24,17 @@ def combine_matches(): with pl.StringCache(): scorefile = read_scorefile(path=args.scorefile, chrom=None) # chrom=None to read all variants - matches = pl.scan_ipc(args.matches) # lazily read ipc to preserve dtypes + matches = pl.concat(pl.collect_all([pl.scan_ipc(x, memory_map=False) for x in args.matches])) + + # make sure there's no duplicate variant_ids across pvars + # processing batched chromosomes with overlapping variants might cause problems + # e.g. chr1 1-100000, chr1 100001-500000 + n_matched = matches.filter(pl.col('match_status') == 'matched').shape[0] + n_unique = matches.filter(pl.col('match_status') == 'matched').select(pl.col('ID')).unique().shape[0] + assert n_matched == n_unique, "Duplicate IDs in final matches" + dataset = args.dataset.replace('_', '-') # _ used as delimiter in pgsc_calc - log_and_write(matches=matches, scorefile=scorefile, dataset=dataset, args=args) + log_and_write(matches=matches.lazy(), scorefile=scorefile, dataset=dataset, args=args) def _parse_args(args=None): @@ -39,8 +47,8 @@ def _parse_args(args=None): help=' Path to scorefile') parser.add_argument('--split', dest='split', default=True, action='store_true', help=' Split scorefile per chromosome?') - parser.add_argument('-m', '--matches', dest='matches', required=True, - help=' Glob of match files including quotation marks e.g. 
"*.ipc"') + parser.add_argument('-m', '--matches', dest='matches', required=True, nargs='+', + help=' List of match files') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', From 2c7216e53ff41334c669f5fa63cf180e1dfbbf27 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 21 Oct 2022 13:39:30 +0100 Subject: [PATCH 13/54] Handle descendant EFO terms to be excluded from trait queries --- pgscatalog_utils/download/download_scorefile.py | 11 +++++++++-- pgscatalog_utils/download/trait.py | 9 ++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py index 6abd365..a475ada 100644 --- a/pgscatalog_utils/download/download_scorefile.py +++ b/pgscatalog_utils/download/download_scorefile.py @@ -38,8 +38,12 @@ def download_scorefile() -> None: pgsc_calc_info = args.pgsc_calc if args.efo: - logger.debug("--trait set, querying traits") - pgs_lst = pgs_lst + [query_trait(x, pgsc_calc_info) for x in args.efo] + if args.efo_include_children: + logger.debug("--trait set, querying traits (including PGS for child terms)") + else: + logger.debug("--trait set, querying traits") + pgs_lst = pgs_lst + [query_trait(x, pgsc_calc_info, args.efo_include_children) for x in args.efo] + if args.pgp: logger.debug("--pgp set, querying publications") @@ -133,6 +137,9 @@ def _parse_args(args=None) -> argparse.Namespace: parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)') parser.add_argument('-t', '--efo', dest='efo', nargs='+', help='Traits described by an EFO term(s) (e.g. EFO_0004611)') + parser.add_argument('-e', '--efo_direct', dest='efo_include_children', action='store_false', + help=' Return only PGS tagged with exact EFO term ' + '(e.g. 
no PGS for child/descendant terms in the ontology)') parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+') parser.add_argument('-b', '--build', dest='build', choices=['GRCh37', 'GRCh38'], help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38') diff --git a/pgscatalog_utils/download/trait.py b/pgscatalog_utils/download/trait.py index 609e3e1..9099bfa 100644 --- a/pgscatalog_utils/download/trait.py +++ b/pgscatalog_utils/download/trait.py @@ -6,16 +6,19 @@ logger = logging.getLogger(__name__) -def query_trait(trait: str, user_agent:str = None) -> list[str]: +def query_trait(trait: str, user_agent:str = None, include_children:bool = True) -> list[str]: logger.debug(f"Querying PGS Catalog with trait {trait}") - api: str = f'/trait/{trait}?include_children=1' + api: str = f'/trait/{trait}?include_children=0' results_json = query_api(api, user_agent) if results_json == {} or results_json == None: logger.critical(f"Bad response from PGS Catalog for EFO term: {trait}") raise Exception - keys: list[str] = ['associated_pgs_ids', 'child_associated_pgs_ids'] + keys: list[str] = ['associated_pgs_ids'] + if include_children: + keys.append('child_associated_pgs_ids') + pgs: list[str] = [] for key in keys: pgs.append(results_json.get(key)) From dbae744ab80dd97cf2f7b5a4c8ab37f66cab4726 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 21 Oct 2022 14:11:41 +0100 Subject: [PATCH 14/54] Update assertion to make sure the same ID isn't present in multiple included matches (e.g. variant-double counting). Previously it would have been considering across multiple scores. 
--- pgscatalog_utils/match/combine_matches.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 3950952..601d5c7 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -26,12 +26,10 @@ def combine_matches(): scorefile = read_scorefile(path=args.scorefile, chrom=None) # chrom=None to read all variants matches = pl.concat(pl.collect_all([pl.scan_ipc(x, memory_map=False) for x in args.matches])) - # make sure there's no duplicate variant_ids across pvars + # make sure there's no duplicate variant_ids across matches in multiple pvars # processing batched chromosomes with overlapping variants might cause problems # e.g. chr1 1-100000, chr1 100001-500000 - n_matched = matches.filter(pl.col('match_status') == 'matched').shape[0] - n_unique = matches.filter(pl.col('match_status') == 'matched').select(pl.col('ID')).unique().shape[0] - assert n_matched == n_unique, "Duplicate IDs in final matches" + assert matches.filter(pl.col('match_status') == 'matched').groupby(['accession', 'ID']).count()['count'] == 1, "Duplicate IDs in final matches" dataset = args.dataset.replace('_', '-') # _ used as delimiter in pgsc_calc log_and_write(matches=matches.lazy(), scorefile=scorefile, dataset=dataset, args=args) From 50868f5dc0e0846efdfea48d13f98a577c30d848 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 21 Oct 2022 14:13:53 +0100 Subject: [PATCH 15/54] Fixed missing max statement --- pgscatalog_utils/match/combine_matches.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 601d5c7..d32e6f7 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -29,7 +29,7 @@ def combine_matches(): # make sure there's no duplicate variant_ids across matches in multiple pvars # 
processing batched chromosomes with overlapping variants might cause problems # e.g. chr1 1-100000, chr1 100001-500000 - assert matches.filter(pl.col('match_status') == 'matched').groupby(['accession', 'ID']).count()['count'] == 1, "Duplicate IDs in final matches" + assert matches.filter(pl.col('match_status') == 'matched').groupby(['accession', 'ID']).count()['count'].max() == 1, "Duplicate IDs in final matches" dataset = args.dataset.replace('_', '-') # _ used as delimiter in pgsc_calc log_and_write(matches=matches.lazy(), scorefile=scorefile, dataset=dataset, args=args) From 01e07188d05971105765c5615b43a316deabd98f Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 21 Oct 2022 17:46:04 +0100 Subject: [PATCH 16/54] lazy write --- pgscatalog_utils/match/write.py | 99 +++++++++++++++------------------ 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index b14d60c..ac4140e 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -35,34 +35,44 @@ def write_out(df: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: if not os.path.isdir(outdir): os.mkdir(outdir) - logger.debug("Splitting by effect type") - effect_types: dict[str, pl.DataFrame] = _split_effect_type(df.collect()) - - logger.debug("Deduplicating variants") - deduplicated: dict[str, pl.DataFrame] = {k: _deduplicate_variants(k, v) for k, v in effect_types.items()} - - logger.debug("Writing out scorefiles") - ea_dict: dict[str, str] = {'is_dominant': 'dominant', 'is_recessive': 'recessive', 'additive': 'additive'} - [_write_scorefile(ea_dict.get(k), v, split, outdir, dataset) for k, v in deduplicated.items()] - - -def _write_scorefile(effect_type: str, scorefiles: pl.DataFrame, split: bool, outdir: str, dataset: str) -> None: + for effect_type in ['additive', 'dominant', 'recessive']: + logger.debug(f"Splitting by effect type {effect_type}") + for i, x in 
enumerate(_deduplicate_variants(effect_type, df)): + effect_df: pl.LazyFrame = x.filter(pl.col('effect_type') == effect_type) + if effect_df.fetch().shape[0] > 0: + chroms: list[int] = effect_df.select("chr_name").unique().collect().get_column("chr_name").to_list() + params = {'chroms': chroms, 'effect_type': effect_type, 'i': str(i)} + _write_scorefile(params=params, scorefiles=effect_df, split=split, outdir=outdir, dataset=dataset) + else: + logger.debug(f"{effect_type} empty, skipping writing out") + continue + + logger.debug("All scorefiles written, goodbye!") + +def _write_scorefile(params: dict, scorefiles: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: """ Write a list of scorefiles with the same effect type """ - # each list element contains a dataframe of variants - # lists are split to ensure variants have unique ID - effect alleles - for i, scorefile in enumerate(scorefiles): - df_dict: dict[str, pl.DataFrame] = _format_scorefile(scorefile, split) # may be split by chrom + effect_type = params.get('effect_type') + i = params.get('i') + chroms = params.get('chroms') + + dfs: list[pl.LazyFrame] = _format_scorefile(scorefiles, chroms) - for k, v in df_dict.items(): - chr = k.replace("false", "ALL") - path: str = os.path.join(outdir, f"{dataset}_{chr}_{effect_type}_{i}.scorefile.gz") - logger.debug(f"Writing matched scorefile to {path}") + if not split: + logger.debug("Writing combined scorefile") + chroms: list[str] = ["ALL"] # reset chroms list and merge into one df + out_dfs: list[pl.LazyFrame] = [pl.concat(dfs)] + else: + out_dfs: list[pl.LazyFrame] = dfs + logger.debug("Writing split scorefiles") - with gzip.open(path, 'wb') as f: - v.write_csv(f, sep="\t") + for chrom, scorefile in zip(chroms, out_dfs): + fout: str = os.path.join(outdir, f"{dataset}_{chrom}_{effect_type}_{i}.scorefile.gz") + logger.debug(f"Writing matched scorefile to {fout}") + with gzip.open(fout, 'wb') as f: + scorefile.collect().write_csv(f, sep="\t") -def 
_format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: +def _format_scorefile(df: pl.LazyFrame, chroms: list[str]) -> list[pl.LazyFrame]: """ Format a dataframe to plink2 --score standard Minimum example: ID | effect_allele | effect_weight @@ -70,30 +80,17 @@ def _format_scorefile(df: pl.DataFrame, split: bool) -> dict[str, pl.DataFrame]: ID | effect_allele | weight_1 | ... | weight_n """ logger.debug("Formatting scorefile to plink2 standard") - if split: - logger.debug("Split output requested") - chroms: list[int] = df["chr_name"].unique().to_list() - return {x: (df.filter(pl.col("chr_name") == x) - .pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") - .rename({"matched_effect_allele": "effect_allele"}) - .fill_null(strategy="zero")) - for x in chroms} - else: - logger.debug("Split output not requested") - formatted: pl.DataFrame = ( - df.pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") - .rename({"matched_effect_allele": "effect_allele"}) - .fill_null(strategy="zero")) - return {'false': formatted} - + dfs = [] + for chrom in chroms: + dfs.append(df.filter(pl.col("chr_name") == chrom).collect() + .pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") + .rename({"matched_effect_allele": "effect_allele"}) + .fill_null(strategy="zero") + .lazy()) + return dfs -def _split_effect_type(df: pl.DataFrame) -> dict[str, pl.DataFrame]: - logger.debug("Splitting matches by effect type") - effect_types: list[str] = df["effect_type"].unique().to_list() - return {x: df.filter(pl.col("effect_type") == x) for x in effect_types} - -def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFrame]: +def _deduplicate_variants(effect_type: str, df: pl.LazyFrame) -> list[pl.LazyFrame]: """ Find variant matches that have duplicate identifiers When merging a lot of scoring files, sometimes a variant might be duplicated this can 
happen when the matched effect allele differs at the same position, e.g.: @@ -111,23 +108,21 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra # handled by pivoting, and it's pointless to split them unnecessarily # 2. use cumcount to number duplicate IDs # 3. join cumcount data on original DF, use this data for splitting - ea_count: pl.DataFrame = (df.select(["ID", "matched_effect_allele"]) + ea_count: pl.LazyFrame = (df.select(["ID", "matched_effect_allele"]) .unique() .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), pl.col("ID").count().over(["ID"]).alias("count") ])) - dup_label: pl.DataFrame = df.join(ea_count, on=["ID", "matched_effect_allele"], how="left") + dup_label: pl.LazyFrame = df.join(ea_count, on=["ID", "matched_effect_allele"], how="left") # now split the matched variants, and make sure we don't lose any - n_splits: int = ea_count.select("cumcount").max()[0, 0] + 1 # cumcount = ngroup-1 + n_splits: int = ea_count.select("cumcount").max().collect()[0, 0] + 1 # cumcount = ngroup-1 df_lst: list = [] - n_var: int = 0 for i in range(0, n_splits): - x: pl.DataFrame = dup_label.filter(pl.col("cumcount") == i) - n_var += x.shape[0] + x: pl.LazyFrame = dup_label.filter(pl.col("cumcount") == i) df_lst.append(x) if len(df_lst) > 1: @@ -135,6 +130,4 @@ def _deduplicate_variants(effect_type: str, df: pl.DataFrame) -> list[pl.DataFra else: logger.debug(f"No duplicate variant identifiers found for effect type {effect_type}") - assert n_var == df.shape[0] - return df_lst From 5a202b5157731c74a8426a63af6894d7da6bdf66 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 21 Oct 2022 18:56:38 +0100 Subject: [PATCH 17/54] fix test with extreme prejudice --- tests/match/test_match.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index f96ffd2..b7ab82c 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -10,20 +10,6 
@@ from pgscatalog_utils.match.match_variants import match_variants -def test_match_pass(mini_scorefile, target_path, tmp_path): - out_dir = str(tmp_path.resolve()) - - args: list[str] = ['match_variants', '-s', mini_scorefile, - '-t', target_path, - '-d', 'test', - '--min_overlap', 0, - '--outdir', out_dir, - '--keep_ambiguous', '--keep_multiallelic'] - - with patch('sys.argv', args): - match_variants() - - def test_match_fail(mini_scorefile, target_path, tmp_path): out_dir = str(tmp_path.resolve()) From 452f76cafc0faaf1ffe65bd5b8ae2e4099417f54 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 25 Oct 2022 10:20:59 +0100 Subject: [PATCH 18/54] Fix typo --- pgscatalog_utils/scorefile/qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/qc.py b/pgscatalog_utils/scorefile/qc.py index f88636d..68e511c 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -86,7 +86,7 @@ def _check_shape(df: pd.DataFrame) -> None: def _check_columns(df: pd.DataFrame) -> None: - assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromsomal positions. If you're " \ + assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromosomal positions. If you're " \ "using PGS Catalog files with rsIDs you should request " \ "harmonised data files (HmPOS) instead." 
assert 'effect_allele' in df, "ERROR: Missing effect allele column" From 4944e7d3df4fb5dcdbae80e2280e694398434bbc Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 25 Oct 2022 10:23:10 +0100 Subject: [PATCH 19/54] Fix basename function when a file name contains 'dot' characters outside the extension(s) --- pgscatalog_utils/scorefile/read.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index 14cb52d..f38a551 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -52,7 +52,12 @@ def _scorefile_dtypes() -> dict[str]: def _get_basename(path: str) -> str: """ Return the basename of a scoring file without extension """ - return os.path.basename(path).split('.')[0] + filename = os.path.basename(path) + if filename.endswith('.txt.gz'): + filename = filename.replace('.txt.gz', '') + elif filename.endswith('.txt'): + filename = filename.replace('.txt', '') + return filename remap_header = { From 758a843c0c8d16971678bc89be33d598908696b7 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Tue, 25 Oct 2022 10:26:08 +0100 Subject: [PATCH 20/54] Generate a combined log file containing score headers information --- .../scorefile/combine_scorefiles.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index bcafa61..c207dd6 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -3,6 +3,7 @@ import os import sys import textwrap +import json from pgscatalog_utils.config import set_logging_level from pgscatalog_utils.scorefile.effect_type import set_effect_type @@ -15,6 +16,17 @@ from pgscatalog_utils.scorefile.write import write_scorefile +json_logs_filename = 'combined_log.json' +headers2logs = [ + 'pgs_name', + 'genome_build', + 'variants_number', + 'trait_efo', + 
'trait_mapped', + 'citation' +] + + def combine_scorefiles(): args = _parse_args() @@ -28,6 +40,10 @@ def combine_scorefiles(): logger.critical(f"Output file {args.outfile} already exists") raise Exception + # Score header logs - init + score_logs = {} + json_logs_file = os.path.dirname(args.outfile)+'/'+json_logs_filename + for x in paths: # Read scorefile df and header h, score = load_scorefile(x) @@ -48,6 +64,24 @@ def combine_scorefiles(): f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") raise Exception + # Build Score header logs + pgs_id = h.get('pgs_id') + score_header = score_logs[pgs_id] = {} + # Scoring file headers + for header in headers2logs: + header_val = h.get(header) + if header.startswith('trait'): + header_val = header_val.split(',') + score_header[header] = header_val + # Other header information + score_header['columns'] = list(score.columns) + score_header['use_harmonised'] = use_harmonised + score_header['use_liftover'] = False + if use_harmonised: + score_header['sources'] = sorted(score['hm_source'].unique().tolist()) + if args.liftover: + score_header['use_liftover'] = True + # Process/QC score and check variant columns score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised) .pipe(quality_control, drop_missing=args.drop_missing) @@ -81,6 +115,10 @@ def combine_scorefiles(): write_scorefile(score, args.outfile) + # Write Score header logs file + with open(json_logs_file, 'w') as fp: + json.dump(score_logs, fp) + def _description_text() -> str: return textwrap.dedent('''\ From 721e3ec20bec891b2a4a97406676dbfd90840f2e Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Wed, 26 Oct 2022 11:21:44 +0100 Subject: [PATCH 21/54] Add more Harmonization logs and liftover logs. 
Change the log ID by using the file name instead of the PGS ID --- .../scorefile/combine_scorefiles.py | 53 ++++++++++++------- pgscatalog_utils/scorefile/read.py | 4 +- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index c207dd6..3a4c9ed 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -12,12 +12,13 @@ from pgscatalog_utils.scorefile.harmonised import remap_harmonised from pgscatalog_utils.scorefile.liftover import liftover from pgscatalog_utils.scorefile.qc import quality_control -from pgscatalog_utils.scorefile.read import load_scorefile +from pgscatalog_utils.scorefile.read import load_scorefile, get_scorefile_basename from pgscatalog_utils.scorefile.write import write_scorefile json_logs_filename = 'combined_log.json' headers2logs = [ + 'pgs_id', 'pgs_name', 'genome_build', 'variants_number', @@ -25,7 +26,12 @@ 'trait_mapped', 'citation' ] - +headers2logs_harmonisation = [ + 'HmPOS_build', + 'HmPOS_date', + 'HmPOS_match_chr', + 'HmPOS_match_pos' +] def combine_scorefiles(): args = _parse_args() @@ -64,24 +70,6 @@ def combine_scorefiles(): f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") raise Exception - # Build Score header logs - pgs_id = h.get('pgs_id') - score_header = score_logs[pgs_id] = {} - # Scoring file headers - for header in headers2logs: - header_val = h.get(header) - if header.startswith('trait'): - header_val = header_val.split(',') - score_header[header] = header_val - # Other header information - score_header['columns'] = list(score.columns) - score_header['use_harmonised'] = use_harmonised - score_header['use_liftover'] = False - if use_harmonised: - score_header['sources'] = sorted(score['hm_source'].unique().tolist()) - if args.liftover: - score_header['use_liftover'] = True - # Process/QC score and check 
variant columns score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised) .pipe(quality_control, drop_missing=args.drop_missing) @@ -115,6 +103,31 @@ def combine_scorefiles(): write_scorefile(score, args.outfile) + # Build Score header logs + score_id = get_scorefile_basename(x) + score_header = score_logs[score_id] = {} + # Scoring file header information + for header in headers2logs: + header_val = h.get(header) + if header.startswith('trait'): + header_val = header_val.split(',') + score_header[header] = header_val + # Other header information + score_header['columns'] = list(score.columns) + score_header['use_liftover'] = False + if args.liftover: + score_header['use_liftover'] = True + # Harmonized header information + score_header['use_harmonised'] = use_harmonised + if use_harmonised: + score_header['sources'] = sorted(score['hm_source'].unique().tolist()) + for hm_header in headers2logs_harmonisation: + hm_header_val = h.get(hm_header) + if hm_header_val: + if hm_header.startswith('HmPOS_match'): + hm_header_val = json.loads(hm_header_val) + score_header[hm_header] = hm_header_val + # Write Score header logs file with open(json_logs_file, 'w') as fp: json.dump(score_logs, fp) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py index f38a551..dbd559b 100644 --- a/pgscatalog_utils/scorefile/read.py +++ b/pgscatalog_utils/scorefile/read.py @@ -12,7 +12,7 @@ def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: logger.debug(f'Reading scorefile {path}') df = pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) return (_read_header(path), - df.assign(filename_prefix=_get_basename(path), filename=path, row_nr=df.index)) + df.assign(filename_prefix=get_scorefile_basename(path), filename=path, row_nr=df.index)) def _read_header(path: str) -> dict: @@ -50,7 +50,7 @@ def _scorefile_dtypes() -> dict[str]: 'hm_chr': str, 'hm_pos': pd.UInt64Dtype(), 'hm_inferOtherAllele': str} 
-def _get_basename(path: str) -> str: +def get_scorefile_basename(path: str) -> str: """ Return the basename of a scoring file without extension """ filename = os.path.basename(path) if filename.endswith('.txt.gz'): From 800121010a6652a5ba779ded5bebc3ed5fe59c96 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 4 Nov 2022 11:00:17 +0000 Subject: [PATCH 22/54] improve build cache and add build argument --- Dockerfile | 32 ++++++++++++++++++++++---------- install.sh | 7 +++++++ 2 files changed, 29 insertions(+), 10 deletions(-) create mode 100644 install.sh diff --git a/Dockerfile b/Dockerfile index 0549d2a..2f6d47c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,30 @@ + FROM python:3.10 as builder + +# docker build --build-arg "ENV=PROD" ... + +ARG ENV + +RUN apt-get update && apt-get install -y sqlite3 + WORKDIR /app -COPY . /app/ -RUN pip install poetry && poetry config virtualenvs.in-project true && \ - poetry install --no-ansi --no-dev - -RUN poetry build +RUN pip install poetry + +RUN python -m venv /venv + +COPY install.sh poetry.lock pyproject.toml /app + +RUN chmod +x install.sh && ./install.sh + +COPY . . + +RUN poetry build && /venv/bin/pip install dist/*.whl -FROM python:3.10 +FROM builder as final -WORKDIR /opt/ +COPY --from=builder /venv /venv -COPY --from=builder /app/dist/pgscatalog_utils-0.3.0-py3-none-any.whl . 
+ENV PATH="/venv/bin:${PATH}" -RUN pip install pgscatalog_utils-0.3.0-py3-none-any.whl -RUN apt-get update && apt-get install -y sqlite3 \ No newline at end of file diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..0ae598e --- /dev/null +++ b/install.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env sh + +if [ ${ENV} = "DEV" ]; then + poetry export --dev --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin +else + poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin +fi From 5b8b8ecd0fadd43140fa3817a8132d303ecc451a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 4 Nov 2022 17:12:04 +0000 Subject: [PATCH 23/54] collect lazy matches just before writing --- pgscatalog_utils/config.py | 3 +- pgscatalog_utils/match/combine_matches.py | 8 +- pgscatalog_utils/match/filter.py | 4 +- pgscatalog_utils/match/match_variants.py | 5 +- pgscatalog_utils/match/write.py | 129 +++++++++++----------- poetry.lock | 23 ++-- pyproject.toml | 1 + 7 files changed, 98 insertions(+), 75 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 7a6b8eb..0fe2bf9 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -1,6 +1,7 @@ import logging -POLARS_MAX_THREADS = 1 # dummy value, is reset by args.n_threads (default: 1) +POLARS_MAX_THREADS: int = 1 # dummy value, is reset by args.n_threads (default: 1) +OUTDIR: str = "." 
# dummy value, reset by args.outdir def set_logging_level(verbose: bool): diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index d32e6f7..0303840 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -16,7 +16,8 @@ def combine_matches(): config.set_logging_level(args.verbose) config.POLARS_MAX_THREADS = args.n_threads - os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) + config.OUTDIR = args.outdir + os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) # TODO: this won't work (after import) # now the environment variable, parsed argument args.n_threads, and threadpool should agree logger.debug(f"Setting POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") logger.debug(f"Using {config.POLARS_MAX_THREADS} threads to read CSVs") @@ -24,7 +25,10 @@ def combine_matches(): with pl.StringCache(): scorefile = read_scorefile(path=args.scorefile, chrom=None) # chrom=None to read all variants - matches = pl.concat(pl.collect_all([pl.scan_ipc(x, memory_map=False) for x in args.matches])) + logger.debug("Reading matches") + matches = pl.concat([pl.read_ipc(x, memory_map=False, rechunk=False) for x in args.matches], rechunk=False) + logger.debug("Rechunking matches") + matches.rechunk() # make sure there's no duplicate variant_ids across matches in multiple pvars # processing batched chromosomes with overlapping variants might cause problems diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index 695b5c2..14e3ed9 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -27,7 +27,9 @@ def filter_scores(scorefile: pl.LazyFrame, matches: pl.LazyFrame, min_overlap: f score_summary: pl.LazyFrame = pl.concat(scores).lazy() filtered_scores: pl.LazyFrame = (filtered_matches.join(score_summary, on='accession', how='left') - .filter(pl.col('score_pass') == True)) + 
.filter(pl.col('score_pass') == True) + .select(['chr_name', 'ID', 'accession', 'effect_type', 'matched_effect_allele', + 'effect_weight'])) return filtered_scores, score_summary diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 0f8224c..bce3344 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -86,16 +86,17 @@ def log_and_write(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str, logger.critical("Error: no target variants match any variants in scoring files") raise Exception("No valid matches found") + write_out(valid_matches, args.split, dataset) + del valid_matches + big_log: pl.LazyFrame = make_logs(scorefile=scorefile, match_candidates=matches, dataset=dataset) summary_log: pl.LazyFrame = make_summary_log(match_candidates=matches, filter_summary=filter_summary, dataset=dataset, scorefile=scorefile) check_log_count(summary_log=summary_log, scorefile=scorefile) - write_log(df=big_log, prefix=dataset, chrom=None, outdir=args.outdir, file_format="csv") summary_log.collect().write_csv(f"{dataset}_summary.csv") - write_out(valid_matches, args.split, args.outdir, dataset) def _check_target_chroms(target: pl.LazyFrame) -> None: diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index ac4140e..33d785f 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -1,13 +1,52 @@ -import gzip import logging import os import typing +import pgzip import polars as pl +from pgscatalog_utils import config logger = logging.getLogger(__name__) +def write_out(matches: pl.LazyFrame, split: bool, dataset: str): + chroms: list[str] = matches.select("chr_name").unique().collect().get_column("chr_name").to_list() + for chrom in chroms: + # 1. filter by chromosome + chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) + # 2. 
split by effect type + additive: pl.LazyFrame + dominant: pl.LazyFrame + recessive: pl.LazyFrame + additive, dominant, recessive = _split_effect_type(chrom_df) + + # 3. deduplicate + effect_types = ['additive', 'dominant', 'recessive'] + deduped = dict(zip(effect_types, [_deduplicate_variants(x) for x in [additive, dominant, recessive]])) + + # 4. pivot and write! + _write_split(deduped, chrom, dataset) + + +def _write_split(deduplicated: dict[str: tuple[int, pl.LazyFrame]], chrom: str, dataset: str): + for effect_type, df_lst in deduplicated.items(): + for i, et_df in df_lst: + if i is False: + # deduplication returned an empty dataframe, so skip (normally recessive or dominant) + continue + + # pivoting is !! _expensive_ !! (it collects the lazyframe) + pivoted: pl.LazyFrame = _pivot_score(et_df, chrom) + fout = os.path.join(config.OUTDIR, f"{dataset}_{chrom}_{effect_type}_{i}.scorefile.gz") + _write_scorefile(pivoted, fout) + + +def _write_scorefile(df, fout): + logger.debug(f"Writing matched scorefile to {fout}") + with pgzip.open(fout, 'wb', thread=config.POLARS_MAX_THREADS) as f: + df.collect().write_csv(f) + + def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], file_format: str, outdir: str) -> None: # feather file preserves dtypes and is small # don't compress the feather file to allow memory mapping @@ -24,73 +63,30 @@ def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], fil case 'csv': fout: str = ''.join([log_name, ".csv.gz"]) logger.debug(f"Writing {fout} in format: {file_format}") - with gzip.open(fout, 'wb') as f: + with pgzip.open(fout, 'wb', thread=config.POLARS_MAX_THREADS) as f: df.collect().write_csv(f) case _: logger.critical(f"Invalid format: {file_format}") raise Exception -def write_out(df: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: - if not os.path.isdir(outdir): - os.mkdir(outdir) - - for effect_type in ['additive', 'dominant', 'recessive']: - logger.debug(f"Splitting by 
effect type {effect_type}") - for i, x in enumerate(_deduplicate_variants(effect_type, df)): - effect_df: pl.LazyFrame = x.filter(pl.col('effect_type') == effect_type) - if effect_df.fetch().shape[0] > 0: - chroms: list[int] = effect_df.select("chr_name").unique().collect().get_column("chr_name").to_list() - params = {'chroms': chroms, 'effect_type': effect_type, 'i': str(i)} - _write_scorefile(params=params, scorefiles=effect_df, split=split, outdir=outdir, dataset=dataset) - else: - logger.debug(f"{effect_type} empty, skipping writing out") - continue - - logger.debug("All scorefiles written, goodbye!") - -def _write_scorefile(params: dict, scorefiles: pl.LazyFrame, split: bool, outdir: str, dataset: str) -> None: - """ Write a list of scorefiles with the same effect type """ - effect_type = params.get('effect_type') - i = params.get('i') - chroms = params.get('chroms') - - dfs: list[pl.LazyFrame] = _format_scorefile(scorefiles, chroms) - - if not split: - logger.debug("Writing combined scorefile") - chroms: list[str] = ["ALL"] # reset chroms list and merge into one df - out_dfs: list[pl.LazyFrame] = [pl.concat(dfs)] - else: - out_dfs: list[pl.LazyFrame] = dfs - logger.debug("Writing split scorefiles") - - for chrom, scorefile in zip(chroms, out_dfs): - fout: str = os.path.join(outdir, f"{dataset}_{chrom}_{effect_type}_{i}.scorefile.gz") - logger.debug(f"Writing matched scorefile to {fout}") - with gzip.open(fout, 'wb') as f: - scorefile.collect().write_csv(f, sep="\t") - - -def _format_scorefile(df: pl.LazyFrame, chroms: list[str]) -> list[pl.LazyFrame]: +def _pivot_score(df: pl.LazyFrame, chrom: str) -> pl.LazyFrame: """ Format a dataframe to plink2 --score standard Minimum example: ID | effect_allele | effect_weight Multiple scores are OK too: ID | effect_allele | weight_1 | ... 
| weight_n """ - logger.debug("Formatting scorefile to plink2 standard") - dfs = [] - for chrom in chroms: - dfs.append(df.filter(pl.col("chr_name") == chrom).collect() - .pivot(index=["ID", "matched_effect_allele"], values="effect_weight", columns="accession") - .rename({"matched_effect_allele": "effect_allele"}) - .fill_null(strategy="zero") - .lazy()) - return dfs + logger.debug(f"Pivoting score for chromosome {chrom}") + return (df.collect() + .pivot(index=["ID", "matched_effect_allele", "effect_type"], values="effect_weight", + columns="accession") + .rename({"matched_effect_allele": "effect_allele"}) + .fill_null(strategy="zero") + .lazy()) -def _deduplicate_variants(effect_type: str, df: pl.LazyFrame) -> list[pl.LazyFrame]: +def _deduplicate_variants(df: pl.LazyFrame) -> list[tuple[int, pl.LazyFrame]]: """ Find variant matches that have duplicate identifiers When merging a lot of scoring files, sometimes a variant might be duplicated this can happen when the matched effect allele differs at the same position, e.g.: @@ -104,11 +100,18 @@ def _deduplicate_variants(effect_type: str, df: pl.LazyFrame) -> list[pl.LazyFra Returns: A list of dataframes, with unique ID - matched effect allele combinations """ + if df.select('ID').head().collect().is_empty(): + logger.info("Empty input: skipping deduplication") + return [(False, df)] + else: + logger.debug("Deduplicating variants") + # 1. unique ID - EA is important because normal duplicates are already # handled by pivoting, and it's pointless to split them unnecessarily # 2. use cumcount to number duplicate IDs # 3. 
join cumcount data on original DF, use this data for splitting - ea_count: pl.LazyFrame = (df.select(["ID", "matched_effect_allele"]) + # note: effect_allele should be equivalent to matched_effect_allele + ea_count: pl.LazyFrame = (df.select(['ID', 'matched_effect_allele']) .unique() .with_columns([ pl.col("ID").cumcount().over(["ID"]).alias("cumcount"), @@ -122,12 +125,14 @@ def _deduplicate_variants(effect_type: str, df: pl.LazyFrame) -> list[pl.LazyFra df_lst: list = [] for i in range(0, n_splits): - x: pl.LazyFrame = dup_label.filter(pl.col("cumcount") == i) - df_lst.append(x) - - if len(df_lst) > 1: - logger.debug(f"Duplicate variant identifiers split for effect type {effect_type}") - else: - logger.debug(f"No duplicate variant identifiers found for effect type {effect_type}") + x: pl.LazyFrame = (dup_label.filter(pl.col("cumcount") == i).drop(['cumcount', 'count'])) + df_lst.append((i, x)) return df_lst + + +def _split_effect_type(df: pl.LazyFrame) -> tuple[pl.LazyFrame, pl.LazyFrame, pl.LazyFrame]: + additive = df.filter(pl.col('effect_type') == 'additive') + dominant = df.filter(pl.col('effect_type') == 'dominant') + recessive = df.filter(pl.col('effect_type') == 'recessive') + return additive, dominant, recessive diff --git a/poetry.lock b/poetry.lock index 2ae26df..f77892b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -223,6 +223,14 @@ numpy = "*" packaging = "*" pandas = ">=0.19" +[[package]] +name = "pgzip" +version = "0.3.2" +description = "A multi-threading implementation of Python gzip module" +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "pillow" version = "9.2.0" @@ -249,22 +257,22 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.14.17" +version = "0.14.25" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] -pandas = ["pyarrow (>=4.0.0)", "pandas"] -connectorx = ["connectorx"] -xlsx2csv = ["xlsx2csv 
(>=0.8.0)"] -timezone = ["backports.zoneinfo", "tzdata"] matplotlib = ["matplotlib"] -fsspec = ["fsspec"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +connectorx = ["connectorx"] +pandas = ["pyarrow (>=4.0.0)", "pandas"] numpy = ["numpy (>=1.16.0)"] +fsspec = ["fsspec"] all = ["polars"] pyarrow = ["pyarrow (>=4.0.0)"] +timezone = ["backports.zoneinfo", "tzdata"] [[package]] name = "psutil" @@ -463,7 +471,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "^3.10" -content-hash = "84b4520b176bb1b892c870fe894814cd05e217a86d7b4fadfa638b91a919bae5" +content-hash = "8bc3a7d9cda455b0a63bce20b944feb398b0a29cb20b2f04c366a6817c5d70fa" [metadata.files] attrs = [] @@ -488,6 +496,7 @@ numpy = [] packaging = [] pandas = [] pandas-schema = [] +pgzip = [] pillow = [] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, diff --git a/pyproject.toml b/pyproject.toml index a1e2d19..3fc9cbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ requests = "^2.28.1" jq = "^1.2.2" polars = "^0.14.9" zstandard = "^0.18.0" +pgzip = "^0.3.2" [tool.poetry.dev-dependencies] pytest = "^7.1.2" From d932b4dacff65165368475a14e3a088886874e00 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 9 Nov 2022 14:41:14 +0000 Subject: [PATCH 24/54] update dependencies --- poetry.lock | 750 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 692 insertions(+), 58 deletions(-) diff --git a/poetry.lock b/poetry.lock index f77892b..10a3848 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7,10 +7,10 @@ optional = false python-versions = ">=3.5" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] 
(>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "mypy (>=0.900,!=0.940)", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy (>=0.900,!=0.940)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] [[package]] name = "certifi" @@ -40,7 +40,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "colorama" @@ -62,11 +62,11 @@ python-versions = ">=3.7" numpy = ">=1.16" [package.extras] -test-no-codebase = ["pillow", "matplotlib", "pytest"] +bokeh = ["bokeh", "selenium"] +docs = ["docutils (<0.18)", "sphinx", "sphinx-rtd-theme"] +test = ["Pillow", "flake8", "isort", "matplotlib", "pytest"] test-minimal = ["pytest"] -test = ["isort", "flake8", "pillow", "matplotlib", "pytest"] -docs = ["sphinx-rtd-theme", "sphinx", "docutils (<0.18)"] -bokeh = ["selenium", "bokeh"] +test-no-codebase = ["Pillow", "matplotlib", "pytest"] [[package]] name = "coverage" @@ -99,9 +99,9 @@ optional = false python-versions = ">=3.7" [package.extras] -all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", 
"xattr"] +all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.23.0)", "unicodedata2 (>=14.0.0)", "xattr", "zopfli (>=0.1.4)"] graphite = ["lz4 (>=1.7.4.2)"] -interpolatable = ["scipy", "munkres"] +interpolatable = ["munkres", "scipy"] lxml = ["lxml (>=4.0,<5)"] pathops = ["skia-pathops (>=0.5.0)"] plot = ["matplotlib"] @@ -110,7 +110,7 @@ symfont = ["sympy"] type1 = ["xattr"] ufo = ["fs (>=2.2.0,<3)"] unicode = ["unicodedata2 (>=14.0.0)"] -woff = ["zopfli (>=0.1.4)", "brotlicffi (>=0.8.0)", "brotli (>=1.0.1)"] +woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] [[package]] name = "idna" @@ -208,7 +208,7 @@ python-dateutil = ">=2.8.1" pytz = ">=2020.1" [package.extras] -test = ["pytest-xdist (>=1.31)", "pytest (>=6.0)", "hypothesis (>=5.5.3)"] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] [[package]] name = "pandas-schema" @@ -257,22 +257,22 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.14.25" +version = "0.14.26" description = "Blazingly fast DataFrame library" category = "main" optional = false python-versions = ">=3.7" [package.extras] -matplotlib = ["matplotlib"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] +all = ["polars[connectorx,fsspec,matplotlib,numpy,pandas,pyarrow,timezone,xlsx2csv]"] connectorx = ["connectorx"] -pandas = ["pyarrow (>=4.0.0)", "pandas"] -numpy = ["numpy (>=1.16.0)"] fsspec = ["fsspec"] -all = ["polars"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +pandas = ["pandas", "pyarrow (>=4.0.0)"] pyarrow = ["pyarrow (>=4.0.0)"] timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] [[package]] name = "psutil" @@ -283,7 +283,7 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [package.extras] -test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"] +test = 
["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "py" @@ -318,7 +318,7 @@ optional = false python-versions = ">=3.6.8" [package.extras] -diagrams = ["railroad-diagrams", "jinja2"] +diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pysqlar" @@ -361,7 +361,7 @@ coverage = {version = ">=5.2.1", extras = ["toml"]} pytest = ">=4.6" [package.extras] -testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] [[package]] name = "python-dateutil" @@ -398,7 +398,20 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "setuptools" +version = "65.5.1" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "setuptools-scm" @@ 
-410,6 +423,7 @@ python-versions = ">=3.7" [package.dependencies] packaging = ">=20.0" +setuptools = "*" tomli = ">=1.0.0" typing-extensions = "*" @@ -450,8 +464,8 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -474,55 +488,624 @@ python-versions = "^3.10" content-hash = "8bc3a7d9cda455b0a63bce20b944feb398b0a29cb20b2f04c366a6817c5d70fa" [metadata.files] -attrs = [] -certifi = [] -cffi = [] -charset-normalizer = [] -colorama = [] -contourpy = [] -coverage = [] -cycler = [] -fonttools = [] -idna = [] +attrs = [ + {file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"}, + {file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"}, +] +certifi = [ + {file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"}, + {file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"}, +] +cffi = [ + {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, 
+ {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, + {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, + {file = 
"cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, + {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, + {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, + {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, + {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, + {file = 
"cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, + {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, + {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, + {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, + {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, + {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, + {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, + {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, + {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, + {file = 
"cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, + {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, + {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, + {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, + {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, +] +colorama = [ + {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, + {file = "colorama-0.4.5.tar.gz", hash = 
"sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, +] +contourpy = [ + {file = "contourpy-1.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:87121b9428ac568fb84fae4af5e7852fc34f02eadc4e3e91f6c8989327692186"}, + {file = "contourpy-1.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1fb782982c42cee667b892a0b0c52a9f6c7ecf1da5c5f4345845f04eaa862f93"}, + {file = "contourpy-1.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:689d7d2a840619915d0abd1ecc6e399fee202f8ad315acda2807f4ca420d0802"}, + {file = "contourpy-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d88814befbd1433152c5f6dd536905149ba028d795a22555b149ae0a36024d9e"}, + {file = "contourpy-1.0.5-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df65f4b2b4e74977f0336bef12a88051ab24e6a16873cd9249f34d67cb3e345d"}, + {file = "contourpy-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6b4c0c723664f65c2a47c8cb6ebbf660b0b2e2d936adf2e8503d4e93359465"}, + {file = "contourpy-1.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bcc98d397c3dea45d5b262029564b29cb8e945f2607a38bee6163694c0a8b4ef"}, + {file = "contourpy-1.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2bf5c846c257578b03d498b20f54f53551616a507d8e5463511c58bb58e9a9cf"}, + {file = "contourpy-1.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cdacddb18d55ffec42d1907079cdc04ec4fa8a990cdf5b9d9fe67d281fc0d12e"}, + {file = "contourpy-1.0.5-cp310-cp310-win32.whl", hash = "sha256:434942fa2f9019b9ae525fb752dc523800c49a1a28fbd6d9240b0fa959573dcc"}, + {file = "contourpy-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:3b3082ade8849130203d461b98c2a061b382c46074b43b4edd5cefd81af92b8a"}, + {file = "contourpy-1.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:057114f698ffb9e54657e8fda6802e2f5c8fad609845cf6afaf31590ef6a33c0"}, + {file = "contourpy-1.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:218722a29c5c26677d37c44f5f8a372daf6f07870aad793a97d47eb6ad6b3290"}, + {file = "contourpy-1.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6c02e22cf09996194bcb3a4784099975cf527d5c29caf759abadf29ebdb2fe27"}, + {file = "contourpy-1.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0d5ee865b5fd16bf62d72122aadcc90aab296c30c1adb0a32b4b66bd843163e"}, + {file = "contourpy-1.0.5-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d45822b0a2a452327ab4f95efe368d234d5294bbf89a99968be27c7938a21108"}, + {file = "contourpy-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dca5be83a6dfaf933a46e3bc2b9f2685e5ec61b22f6a38ad740aac9c16e9a0ff"}, + {file = "contourpy-1.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3c3f2f6b898a40207843ae01970e57e33d22a26b22f23c6a5e07b4716751085f"}, + {file = "contourpy-1.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c2b4eab7c12f9cb460509bc34a3b086f9802f0dba27c89a63df4123819ad64af"}, + {file = "contourpy-1.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:09ed9b63f4df8a7591b7a4a26c1ad066dcaafda1f846250fdcb534074a411692"}, + {file = "contourpy-1.0.5-cp311-cp311-win32.whl", hash = "sha256:f670686d99c867d0f24b28ce8c6f02429c6eef5e2674aab287850d0ee2d20437"}, + {file = "contourpy-1.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:c51568e94f7f232296de30002f2a50f77a7bd346673da3e4f2aaf9d2b833f2e5"}, + {file = "contourpy-1.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7c9e99aac7b430f6a9f15eebf058c742097cea3369f23a2bfc5e64d374b67e3a"}, + {file = "contourpy-1.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3210d93ad2af742b6a96cf39792f7181822edbb8fe11c3ef29d1583fe637a8d8"}, + {file = "contourpy-1.0.5-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128bd7acf569f8443ad5b2227f30ac909e4f5399ed221727eeacf0c6476187e6"}, + {file = 
"contourpy-1.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:813c2944e940ef8dccea71305bacc942d4b193a021140874b3e58933ec44f5b6"}, + {file = "contourpy-1.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:a74afd8d560eaafe0d9e3e1db8c06081282a05ca4de00ee416195085a79d7d3d"}, + {file = "contourpy-1.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d0ad9a85f208473b1f3613c45756c7aa6fcc288266a8c7b873f896aaf741b6b"}, + {file = "contourpy-1.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:60f37acd4e4227c5a29f737d9a85ca3145c529a8dd4bf70af7f0637c61b49222"}, + {file = "contourpy-1.0.5-cp37-cp37m-win32.whl", hash = "sha256:b50e481a4317a8efcfffcfddcd4c9b36eacba440440e70cbe0256aeb6fd6abae"}, + {file = "contourpy-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:0395ae71164bfeb2dedd136e03c71a2718a5aa9873a46f518f4133be0d63e1d2"}, + {file = "contourpy-1.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3ca40d7844b391d90b864c6a6d1bb6b88b09035fb4d866d64d43c4d26fb0ab64"}, + {file = "contourpy-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3109fa601d2a448cec4643abd3a31f972bf05b7c2f2e83df9d3429878f8c10ae"}, + {file = "contourpy-1.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:06c4d1dde5ee4f909a8a95ba1eb04040c6c26946b4f3b5beaf10d45f14e940ee"}, + {file = "contourpy-1.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f54dcc9bb9390fd0636301ead134d46d5229fe86da0db4d974c0fda349f560e"}, + {file = "contourpy-1.0.5-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46b8e24813e2fb5a3e598c1f8b9ae403e1438cb846a80cc2b33cddf19dddd7f2"}, + {file = "contourpy-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:061e1f066c419ffe25b615a1df031b4832ea1d7f2676937e69e8e00e24512005"}, + {file = "contourpy-1.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:19ea64fa0cf389d2ebc10974616acfa1fdecbd73d1fd9c72215b782f3c40f561"}, + {file = 
"contourpy-1.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfe924e5a63861c82332a12adeeab955dc8c8009ddbbd80cc2fcca049ff89a49"}, + {file = "contourpy-1.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bed3a2a823a041e8d249b1a7ec132933e1505299329b5cfe1b2b5ec689ec7675"}, + {file = "contourpy-1.0.5-cp38-cp38-win32.whl", hash = "sha256:0389349875424aa8c5e61f757e894687916bc4e9616cc6afcbd8051aa2428952"}, + {file = "contourpy-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:2b5e334330d82866923015b455260173cb3b9e3b4e297052d758abd262031289"}, + {file = "contourpy-1.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:def9a01b73c9e27d70ea03b381fb3e7aadfac1f398dbd63751313c3a46747ef5"}, + {file = "contourpy-1.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:59c827e536bb5e3ef58e06da0faba61fd89a14f30b68bcfeca41f43ca83a1942"}, + {file = "contourpy-1.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f05d311c937da03b0cd26ac3e14cb991f6ff8fc94f98b3df9713537817539795"}, + {file = "contourpy-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:970a4be7ec84ccda7c27cb4ae74930bbbd477bc8d849ed55ea798084dd5fca8c"}, + {file = "contourpy-1.0.5-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f7672148f8fca48e4efc16aba24a7455b40c22d4f8abe42475dec6a12b0bb9a"}, + {file = "contourpy-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eba62b7c21a33e72dd8adab2b92dd5610d8527f0b2ac28a8e0770e71b21a13f9"}, + {file = "contourpy-1.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:dd084459ecdb224e617e4ab3f1d5ebe4d1c48facb41f24952b76aa6ba9712bb0"}, + {file = "contourpy-1.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c5158616ab39d34b76c50f40c81552ee180598f7825dc7a66fd187d29958820f"}, + {file = "contourpy-1.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f856652f9b533c6cd2b9ad6836a7fc0e43917d7ff15be46c5baf1350f8cdc5d9"}, + {file = "contourpy-1.0.5-cp39-cp39-win32.whl", hash = 
"sha256:f1cc623fd6855b25da52b3275e0c9e51711b86a9dccc75f8c9ab4432fd8e42c7"}, + {file = "contourpy-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:e67dcaa34dcd908fcccbf49194211d847c731b6ebaac661c1c889f1bf6af1e44"}, + {file = "contourpy-1.0.5-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bfd634cb9685161b2a51f73a7fc4736fd0d67a56632d52319317afaa27f08243"}, + {file = "contourpy-1.0.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79908b9d02b1d6c1c71ff3b7ad127f3f82e14a8e091ab44b3c7e34b649fea733"}, + {file = "contourpy-1.0.5-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4963cf08f4320d98ae72ec7694291b8ab85cb7da3b0cd824bc32701bc992edf"}, + {file = "contourpy-1.0.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cfc067ddde78b76dcbc9684d82688b7d3c5158fa2254a085f9bcb9586c1e2d8"}, + {file = "contourpy-1.0.5-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:9939796abcadb2810a63dfb26ff8ca4595fe7dd70a3ceae7f607a2639b714307"}, + {file = "contourpy-1.0.5-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d8150579bf30cdf896906baf256aa200cd50dbe6e565c17d6fd3d678e21ff5de"}, + {file = "contourpy-1.0.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed9c91bf4ce614efed5388c3f989a7cfe08728ab871d995a486ea74ff88993db"}, + {file = "contourpy-1.0.5-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b46a04588ceb7cf132568e0e564a854627ef87a1ed3bf536234540a79ced44b0"}, + {file = "contourpy-1.0.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b85553699862c09937a7a5ea14ee6229087971a7d51ae97d5f4b407f571a2c17"}, + {file = "contourpy-1.0.5-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:99a8071e351b50827ad976b92ed91845fb614ac67a3c41109b24f3d8bd3afada"}, + {file = "contourpy-1.0.5-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:fb0458d74726937ead9e2effc91144aea5a58ecee9754242f8539a782bed685a"}, + {file = "contourpy-1.0.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f89f0608a5aa8142ed0e53957916623791a88c7f5e5f07ae530c328beeb888f"}, + {file = "contourpy-1.0.5-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce763369e646e59e4ca2c09735cd1bdd3048d909ad5f2bc116e83166a9352f3c"}, + {file = "contourpy-1.0.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c16fa267740d67883899e054cccb4279e002f3f4872873b752c1ba15045ff49"}, + {file = "contourpy-1.0.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a30e95274f5c0e007ccc759ec258aa5708c534ec058f153ee25ac700a2f1438b"}, + {file = "contourpy-1.0.5.tar.gz", hash = "sha256:896631cd40222aef3697e4e51177d14c3709fda49d30983269d584f034acc8a4"}, +] +coverage = [ + {file = "coverage-6.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef8674b0ee8cc11e2d574e3e2998aea5df5ab242e012286824ea3c6970580e53"}, + {file = "coverage-6.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:784f53ebc9f3fd0e2a3f6a78b2be1bd1f5575d7863e10c6e12504f240fd06660"}, + {file = "coverage-6.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4a5be1748d538a710f87542f22c2cad22f80545a847ad91ce45e77417293eb4"}, + {file = "coverage-6.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83516205e254a0cb77d2d7bb3632ee019d93d9f4005de31dca0a8c3667d5bc04"}, + {file = "coverage-6.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af4fffaffc4067232253715065e30c5a7ec6faac36f8fc8d6f64263b15f74db0"}, + {file = "coverage-6.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:97117225cdd992a9c2a5515db1f66b59db634f59d0679ca1fa3fe8da32749cae"}, + {file = "coverage-6.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:a1170fa54185845505fbfa672f1c1ab175446c887cce8212c44149581cf2d466"}, + {file = "coverage-6.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a"}, + {file = "coverage-6.5.0-cp310-cp310-win32.whl", hash = "sha256:5dbec3b9095749390c09ab7c89d314727f18800060d8d24e87f01fb9cfb40b32"}, + {file = "coverage-6.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:59f53f1dc5b656cafb1badd0feb428c1e7bc19b867479ff72f7a9dd9b479f10e"}, + {file = "coverage-6.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4a5375e28c5191ac38cca59b38edd33ef4cc914732c916f2929029b4bfb50795"}, + {file = "coverage-6.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4ed2820d919351f4167e52425e096af41bfabacb1857186c1ea32ff9983ed75"}, + {file = "coverage-6.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:33a7da4376d5977fbf0a8ed91c4dffaaa8dbf0ddbf4c8eea500a2486d8bc4d7b"}, + {file = "coverage-6.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8fb6cf131ac4070c9c5a3e21de0f7dc5a0fbe8bc77c9456ced896c12fcdad91"}, + {file = "coverage-6.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a6b7d95969b8845250586f269e81e5dfdd8ff828ddeb8567a4a2eaa7313460c4"}, + {file = "coverage-6.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1ef221513e6f68b69ee9e159506d583d31aa3567e0ae84eaad9d6ec1107dddaa"}, + {file = "coverage-6.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cca4435eebea7962a52bdb216dec27215d0df64cf27fc1dd538415f5d2b9da6b"}, + {file = "coverage-6.5.0-cp311-cp311-win32.whl", hash = "sha256:98e8a10b7a314f454d9eff4216a9a94d143a7ee65018dd12442e898ee2310578"}, + {file = "coverage-6.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:bc8ef5e043a2af066fa8cbfc6e708d58017024dc4345a1f9757b329a249f041b"}, + {file = "coverage-6.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", 
hash = "sha256:4433b90fae13f86fafff0b326453dd42fc9a639a0d9e4eec4d366436d1a41b6d"}, + {file = "coverage-6.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4f05d88d9a80ad3cac6244d36dd89a3c00abc16371769f1340101d3cb899fc3"}, + {file = "coverage-6.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:94e2565443291bd778421856bc975d351738963071e9b8839ca1fc08b42d4bef"}, + {file = "coverage-6.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79"}, + {file = "coverage-6.5.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:255758a1e3b61db372ec2736c8e2a1fdfaf563977eedbdf131de003ca5779b7d"}, + {file = "coverage-6.5.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:851cf4ff24062c6aec510a454b2584f6e998cada52d4cb58c5e233d07172e50c"}, + {file = "coverage-6.5.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:12adf310e4aafddc58afdb04d686795f33f4d7a6fa67a7a9d4ce7d6ae24d949f"}, + {file = "coverage-6.5.0-cp37-cp37m-win32.whl", hash = "sha256:b5604380f3415ba69de87a289a2b56687faa4fe04dbee0754bfcae433489316b"}, + {file = "coverage-6.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:4a8dbc1f0fbb2ae3de73eb0bdbb914180c7abfbf258e90b311dcd4f585d44bd2"}, + {file = "coverage-6.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d900bb429fdfd7f511f868cedd03a6bbb142f3f9118c09b99ef8dc9bf9643c3c"}, + {file = "coverage-6.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2198ea6fc548de52adc826f62cb18554caedfb1d26548c1b7c88d8f7faa8f6ba"}, + {file = "coverage-6.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c4459b3de97b75e3bd6b7d4b7f0db13f17f504f3d13e2a7c623786289dd670e"}, + {file = "coverage-6.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:20c8ac5386253717e5ccc827caad43ed66fea0efe255727b1053a8154d952398"}, + {file = "coverage-6.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b07130585d54fe8dff3d97b93b0e20290de974dc8177c320aeaf23459219c0b"}, + {file = "coverage-6.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dbdb91cd8c048c2b09eb17713b0c12a54fbd587d79adcebad543bc0cd9a3410b"}, + {file = "coverage-6.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:de3001a203182842a4630e7b8d1a2c7c07ec1b45d3084a83d5d227a3806f530f"}, + {file = "coverage-6.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e07f4a4a9b41583d6eabec04f8b68076ab3cd44c20bd29332c6572dda36f372e"}, + {file = "coverage-6.5.0-cp38-cp38-win32.whl", hash = "sha256:6d4817234349a80dbf03640cec6109cd90cba068330703fa65ddf56b60223a6d"}, + {file = "coverage-6.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:7ccf362abd726b0410bf8911c31fbf97f09f8f1061f8c1cf03dfc4b6372848f6"}, + {file = "coverage-6.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:633713d70ad6bfc49b34ead4060531658dc6dfc9b3eb7d8a716d5873377ab745"}, + {file = "coverage-6.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:95203854f974e07af96358c0b261f1048d8e1083f2de9b1c565e1be4a3a48cfc"}, + {file = "coverage-6.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9023e237f4c02ff739581ef35969c3739445fb059b060ca51771e69101efffe"}, + {file = "coverage-6.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:265de0fa6778d07de30bcf4d9dc471c3dc4314a23a3c6603d356a3c9abc2dfcf"}, + {file = "coverage-6.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f830ed581b45b82451a40faabb89c84e1a998124ee4212d440e9c6cf70083e5"}, + {file = "coverage-6.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7b6be138d61e458e18d8e6ddcddd36dd96215edfe5f1168de0b1b32635839b62"}, + 
{file = "coverage-6.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:42eafe6778551cf006a7c43153af1211c3aaab658d4d66fa5fcc021613d02518"}, + {file = "coverage-6.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:723e8130d4ecc8f56e9a611e73b31219595baa3bb252d539206f7bbbab6ffc1f"}, + {file = "coverage-6.5.0-cp39-cp39-win32.whl", hash = "sha256:d9ecf0829c6a62b9b573c7bb6d4dcd6ba8b6f80be9ba4fc7ed50bf4ac9aecd72"}, + {file = "coverage-6.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc2af30ed0d5ae0b1abdb4ebdce598eafd5b35397d4d75deb341a614d333d987"}, + {file = "coverage-6.5.0-pp36.pp37.pp38-none-any.whl", hash = "sha256:1431986dac3923c5945271f169f59c45b8802a114c8f548d611f2015133df77a"}, + {file = "coverage-6.5.0.tar.gz", hash = "sha256:f642e90754ee3e06b0e7e51bce3379590e76b7f76b708e1a71ff043f87025c84"}, +] +cycler = [ + {file = "cycler-0.11.0-py3-none-any.whl", hash = "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3"}, + {file = "cycler-0.11.0.tar.gz", hash = "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"}, +] +fonttools = [ + {file = "fonttools-4.37.4-py3-none-any.whl", hash = "sha256:afae1b39555f9c3f0ad1f0f1daf678e5ad157e38c8842ecb567951bf1a9b9fd7"}, + {file = "fonttools-4.37.4.zip", hash = "sha256:86918c150c6412798e15a0de6c3e0d061ddefddd00f97b4f7b43dfa867ad315e"}, +] +idna = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] -jq = [] -kiwisolver = [] -matplotlib = [] -memory-profiler = [] -numpy = [] -packaging = [] -pandas = [] -pandas-schema = [] -pgzip = [] -pillow = [] 
+jq = [ + {file = "jq-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4edd3edde1bec54408dc14a4ab4350523e2d69c15cac58033459cff3ff814ca9"}, + {file = "jq-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11d6e7c98efcb38b95481687310ed0ec7f230cc8916ac1c054827552f45ece5d"}, + {file = "jq-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c4e9d9cdf5e63bddeea01d1150903dcb45c3fbd028ecf4578561b0b6dddb8dc"}, + {file = "jq-1.3.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f36e1d4837383489b0e0c2c70da06c2cb6332bca4fa864b0c7a1500abfed384a"}, + {file = "jq-1.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75fd74d74e0f78f513fd8517d39fee591b6dc194b422e5b2d88b77899a79a2a9"}, + {file = "jq-1.3.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:05d44bdf6a97ad4866ad099fed5297214a58e1e3e56657be41495c69ce74aa11"}, + {file = "jq-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:18bb7c70fc01f5e4739b1ac3cbab958eae881b1991615c973b9cdcfcd06ce93f"}, + {file = "jq-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06d02868ec10cf1e744fda3ec1b6928eb2a05fbbb3b52383c85285d52e8e000a"}, + {file = "jq-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:228c42e825007277fd01ad71f6ea93bda289fe45693b2d3649eeb87b75a6b083"}, + {file = "jq-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:118070043a369378feadf22f091e57b4054b3325ccdc248e1f19c8586bfe9e6f"}, + {file = "jq-1.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fa0a8032416746edb57ea570cafc99026c2c240d8a0624c0a43631fc82cc0efa"}, + {file = "jq-1.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:56ea972fd697aea1658c56bd18da9d2ae2494ae9be627858a5152d70cdea75f2"}, + {file = "jq-1.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:150aeaa02d57fc9c4966c7af8c6c2a9c6736f82a798c7bc72b44dd7970f1f8d8"}, + {file = "jq-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78d13cbeb6dd512adcf2bbebf393c18e3c0b7e3e366ce7a05b46310ce2924a65"}, + {file = "jq-1.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:28c6ec10c97e697ca73d1fe188c340d53ad17fc4b42fcca87570c060f9364c55"}, + {file = "jq-1.3.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16e28f6964b827a5170a0d7f086367601ef1d420952de81934e79cff638a6bc8"}, + {file = "jq-1.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d8c8a0e7bc6f738e482472b4fbb6be97eab707924661782e90c378327688b29"}, + {file = "jq-1.3.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0dc8ca9e2ea1afc8e856e92eee70f6ab728342d1c071c741b2047d5c08961d8a"}, + {file = "jq-1.3.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fa7724caaedf6c48522a6805e4306978defc5f0d6bd6ab0d25eeb7207dae9a90"}, + {file = "jq-1.3.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:e97578a5e53b06481dbdf5b88e38e6dd856b53ba7bd68678ceadb50de3799fd1"}, + {file = "jq-1.3.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:673ec567be3644166a367627e8393944ce2d21b2e14dc1f4be48b2e3082ccfc2"}, + {file = "jq-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fe9fb7748e39b30b5723a7f6695e2f6c58707563b94f1239766d2c8b9550f147"}, + {file = "jq-1.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ead1c073cf33326be15a879475e3dd8b5f5a70ccff95acafbba56ea3d231803d"}, + {file = "jq-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e97850b803ea8604cd14645a63ad5863d86fdbfaf0bede0da0490c3108956bd0"}, + {file = "jq-1.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ae2b94979bcd6a183aa18056bce975d5081199d83cbf914ff71aecab4e0a2d4"}, + {file = 
"jq-1.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:6b07ad65aa9fca72b7e722585bac5a5e13280257abe9e9d5af845a003e66ff8b"}, + {file = "jq-1.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:028604640ba281c2bc5fa1b4284078d4d56d9431c96b6f71715fd801a3d2cdfb"}, + {file = "jq-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:95160aa31efde7939763c80767e85d9a34544ab87e7ed66ee13f42dbc4595d37"}, + {file = "jq-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d49afd3743d9703d31ed2148d9c5d8edc7bf1c894688f4c72b0d9ef5e0add7b3"}, + {file = "jq-1.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec2255c7fe76070a8ee52cc3d32a7cd7f84fca9a6048873c01b07ca1c70db295"}, + {file = "jq-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd04032736f7b94d1e0720634038cc58980f5eeac7e280bbf5f70e03c7f1afd9"}, + {file = "jq-1.3.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d263927eb8140c174e715da2ad8c009e65e7d3fa82b8af176744b9367aae860"}, + {file = "jq-1.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:980ac3ad4fa9e61bbec4e17d0d01c7bad11decca49620d9766aff047a4043eb9"}, + {file = "jq-1.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5952b363fd44e4c2e1ceb8a88c0205699ee4eb060c34e1b590f39f3490533549"}, + {file = "jq-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dab5daca625ba4c10c3579d55fe057b063ad609f0c0382ae6e28936d7ec8a772"}, + {file = "jq-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:52ffba73a294949dfddef52a70672e3d5bc694f9e593d3cbcc115449c735e26c"}, + {file = "jq-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c09caae5a24a3216a1bb812489e1ad6bc0e2b7410853c930c272a4ab65c8667"}, + {file = "jq-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1569aa11654a686be9323c9c4329d4a961dd96f9f0b9800eb3bdd939b017cb8a"}, + {file = 
"jq-1.3.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb28afd2e43e1ece66618381b646ff1da65fc129f0d9cc600ec224f384add061"}, + {file = "jq-1.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bbf73f553e23f99c33463041dc04de3b5a5eefdfc35bd9e2e33ba4f7b060249e"}, + {file = "jq-1.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4bc0680f7b4cff6a9077a288e6e166a327eb916a50041cf10d9d3df289b1e5d1"}, + {file = "jq-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0015b80570b38176f5ac22049b957d4f34ee8030f6ee89955f95bce4bfff8451"}, + {file = "jq-1.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:4b46fe1f91423699337c5f3b1001943b911bb54a9e045a1b8d6493142f4acc45"}, + {file = "jq-1.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71d5725dae1dd2f636fb696865a10f260610a23ea029ca7b78f2b7c033b3b148"}, + {file = "jq-1.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e874298835c55e60bbf27bc21799ec64bc1305a0b1dcc4646f2755210fe43bfa"}, + {file = "jq-1.3.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba4afcb00c1fc8d1f0d79f0029345f91e207909fd2c125aeedbf7b4cf2c1fc84"}, + {file = "jq-1.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b3352b273e17e530de5cfbf55cdba68d6231cdf0ac10ca6baf438e41fd43e881"}, + {file = "jq-1.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96da9428a158fa377ec1f432e4790ce3e0640e28004aa1cb6486db374277c6a1"}, + {file = "jq-1.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b1ad8a91623fa3c5cecff9dc398eebf7e5e82ba9982f60e5ef17172d78a4ecc"}, + {file = "jq-1.3.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c427bb8f782eb4ac0326b9f65d5f002bd9a5947d9bc715e662cda659361c48c"}, + {file = 
"jq-1.3.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:52cce695dafc461becee6d14889918db7cccafd6f3c28e9174042caa7e017e96"}, + {file = "jq-1.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee0503d45e8b24dd4757460e7664c275aed9a6e5f4986b16a483dc11fb83c19b"}, + {file = "jq-1.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c7b8f71778fb4140e923bb716caef0709e6f64d33ea369236f1d43c76d62f7a"}, + {file = "jq-1.3.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bb72811aa78cf0f82d512731c47e69611168909b991b43481d2a6dc0da940835"}, + {file = "jq-1.3.0.tar.gz", hash = "sha256:96b66f41a91c9794f8051cc32d8fd3206c6409693f0076b22eacb4faa0bc504f"}, +] +kiwisolver = [ + {file = "kiwisolver-1.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6"}, + {file = "kiwisolver-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c"}, + {file = "kiwisolver-1.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3"}, + {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938"}, + {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d"}, + {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09"}, + {file = "kiwisolver-1.4.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de"}, + {file = 
"kiwisolver-1.4.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32"}, + {file = "kiwisolver-1.4.4-cp310-cp310-win32.whl", hash = "sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408"}, + {file = "kiwisolver-1.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004"}, + {file = "kiwisolver-1.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6"}, + {file = "kiwisolver-1.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2"}, + {file = "kiwisolver-1.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca"}, + {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69"}, + {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514"}, + {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494"}, + {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5"}, + {file = "kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51"}, + {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da"}, + {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_i686.whl", hash 
= "sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4"}, + {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626"}, + {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750"}, + {file = "kiwisolver-1.4.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4"}, + {file = "kiwisolver-1.4.4-cp311-cp311-win32.whl", hash = "sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e"}, + {file = "kiwisolver-1.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-win32.whl", hash = "sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3"}, + {file = "kiwisolver-1.4.4-cp37-cp37m-win_amd64.whl", hash = 
"sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166"}, + {file = "kiwisolver-1.4.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454"}, + {file = "kiwisolver-1.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0"}, + {file = "kiwisolver-1.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c"}, + {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae"}, + {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0"}, + {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1"}, + {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d"}, + {file = "kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c"}, + {file = "kiwisolver-1.4.4-cp38-cp38-win32.whl", hash = "sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191"}, + {file = "kiwisolver-1.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766"}, + {file = "kiwisolver-1.4.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8"}, + {file = "kiwisolver-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897"}, + {file = "kiwisolver-1.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824"}, + {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29"}, + {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f"}, + {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd"}, + {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac"}, + {file = "kiwisolver-1.4.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9"}, + {file = "kiwisolver-1.4.4-cp39-cp39-win32.whl", hash = "sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea"}, + {file = "kiwisolver-1.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b"}, + {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a"}, + {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d"}, + {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a"}, + {file = "kiwisolver-1.4.4-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871"}, + {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9"}, + {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8"}, + {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286"}, + {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb"}, + {file = "kiwisolver-1.4.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f"}, + {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008"}, + {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767"}, + {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9"}, + {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2"}, + {file = "kiwisolver-1.4.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b"}, + {file = "kiwisolver-1.4.4.tar.gz", hash = "sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955"}, +] +matplotlib = [ + {file = "matplotlib-3.6.0-cp310-cp310-macosx_10_12_universal2.whl", hash = "sha256:6b98e098549d3aea2bfb93f38f0b2ecadcb423fa1504bbff902c01efdd833fd8"}, + {file = "matplotlib-3.6.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = 
"sha256:798559837156b8e2e2df97cffca748c5c1432af6ec5004c2932e475d813f1743"}, + {file = "matplotlib-3.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e572c67958f7d55eae77f5f64dc7bd31968cc9f24c233926833efe63c60545f2"}, + {file = "matplotlib-3.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ec2edf7f74829eae287aa53d64d83ad5d43ee51d29fb1d88e689d8b36028312"}, + {file = "matplotlib-3.6.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51092d13499be72e47c15c3a1ae0209edaca6be42b65ffbbefbe0c85f6153c6f"}, + {file = "matplotlib-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9295ca10a140c21e40d2ee43ef423213dc20767f6cea6b87c36973564bc51095"}, + {file = "matplotlib-3.6.0-cp310-cp310-win32.whl", hash = "sha256:1a4835c177821f3729be27ae9be7b8ae209fe75e83db7d9b2bfd319a998f0a42"}, + {file = "matplotlib-3.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:2b60d4abcb6a405ca7d909c80791b00637d22c62aa3bb0ffff7e589f763867f5"}, + {file = "matplotlib-3.6.0-cp311-cp311-macosx_10_12_universal2.whl", hash = "sha256:66a0db13f77aa7806dba29273874cf862450c61c2e5158245d17ee85d983fe8e"}, + {file = "matplotlib-3.6.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:1739935d293d0348d7bf662e8cd0edb9c2aa8f20ccd646db755ce0f3456d24e4"}, + {file = "matplotlib-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1559213b803959a2b8309122585b5226d1c2fb66c933b1a2094cf1e99cb4fb90"}, + {file = "matplotlib-3.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5bd3b3ff191f81509d9a1afd62e1e3cda7a7889c35b5b6359a1241fe1511015"}, + {file = "matplotlib-3.6.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1954d71cdf15c19e7f3bf2235a4fe1600ba42f34d472c9495bcf54d75a43e4e"}, + {file = "matplotlib-3.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d840712f4b4c7d2a119f993d7e43ca9bcaa73aeaa24c322fa2bdf4f689a3ee09"}, + {file = 
"matplotlib-3.6.0-cp311-cp311-win32.whl", hash = "sha256:89e1978c3fbe4e3d4c6ad7db7e6f982607cb2546f982ccbe42708392437b1972"}, + {file = "matplotlib-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:9711ef291e184b5a73c9d3af3f2d5cfe25d571c8dd95aa498415f74ac7e221a8"}, + {file = "matplotlib-3.6.0-cp38-cp38-macosx_10_12_universal2.whl", hash = "sha256:fbbceb0a0dfe9213f6314510665a32ef25fe29b50657567cd00115fbfcb3b20d"}, + {file = "matplotlib-3.6.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:62319d57dab5ad3e3494dd97a214e22079d3f72a0c8a2fd001829c2c6abbf8d1"}, + {file = "matplotlib-3.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:140316427a7c384e3dd37efb3a73cd67e14b0b237a6d277def91227f43cdcec2"}, + {file = "matplotlib-3.6.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ccea337fb9a44866c5300c594b13d4d87e827ebc3c353bff15d298bac976b654"}, + {file = "matplotlib-3.6.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:16a899b958dd76606b571bc7eaa38f09160c27dfb262e493584644cfd4a77f0f"}, + {file = "matplotlib-3.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd73a16a759865831be5a8fb6546f2a908c8d7d7f55c75f94ee7c2ca13cc95de"}, + {file = "matplotlib-3.6.0-cp38-cp38-win32.whl", hash = "sha256:2ed779a896b70c8012fe301fb91ee37e713e1dda1eb8f37de04cdbf506706983"}, + {file = "matplotlib-3.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:eca6f59cd0729edaeaa7032d582dffce518a420d4961ef3e8c93dce86be352c3"}, + {file = "matplotlib-3.6.0-cp39-cp39-macosx_10_12_universal2.whl", hash = "sha256:408bbf968c15e9e38df9f25a588e372e28a43240cf5884c9bc6039a5021b7d5b"}, + {file = "matplotlib-3.6.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:7127e2b94571318531caf098dc9e8f60f5aba1704600f0b2483bf151d535674a"}, + {file = "matplotlib-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f0d5b9b14ccc7f539143ac9eb1c6b57d26d69ca52d30c3d719a7bc4123579e44"}, + {file = 
"matplotlib-3.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa19508d8445f5648cd1ffe4fc6d4f7daf8b876f804e9a453df6c3708f6200b"}, + {file = "matplotlib-3.6.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ae1b9b555212c1e242666af80e7ed796705869581e2d749971db4e682ccc1f3"}, + {file = "matplotlib-3.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0958fc3fdc59c1b716ee1a5d14e73d03d541d873241a37c5c3a86f7ef6017923"}, + {file = "matplotlib-3.6.0-cp39-cp39-win32.whl", hash = "sha256:efe9e8037b989b14bb1887089ae763385431cc06fe488406413079cfd2a3a089"}, + {file = "matplotlib-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b0320f882214f6ffde5992081520b57b55450510bdaa020e96aacff9b7ae10e6"}, + {file = "matplotlib-3.6.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:11c1987b803cc2b26725659cfe817478f0a9597878e5c4bf374cfe4e12cbbd79"}, + {file = "matplotlib-3.6.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:802feae98addb9f21707649a7f229c90a59fad34511881f20b906a5e8e6ea475"}, + {file = "matplotlib-3.6.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efd2e12f8964f8fb4ba1984df71d85d02ef0531e687e59f78ec8fc07271a3857"}, + {file = "matplotlib-3.6.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4eba6972b796d97c8fcc5266b6dc42ef27c2dce4421b846cded0f3af851b81c9"}, + {file = "matplotlib-3.6.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:df26a09d955b3ab9b6bc18658b9403ed839096c97d7abe8806194e228a485a3c"}, + {file = "matplotlib-3.6.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e01382c06ac3710155a0ca923047c5abe03c676d08f03e146c6a240d0a910713"}, + {file = "matplotlib-3.6.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4699bb671dbc4afdb544eb893e4deb8a34e294b7734733f65b4fd2787ba5fbc6"}, + {file = "matplotlib-3.6.0-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:657fb7712185f82211170ac4debae0800ed4f5992b8f7ebba2a9eabaf133a857"}, + {file = "matplotlib-3.6.0.tar.gz", hash = "sha256:c5108ebe67da60a9204497d8d403316228deb52b550388190c53a57394d41531"}, +] +memory-profiler = [ + {file = "memory_profiler-0.60.0.tar.gz", hash = "sha256:6a12869511d6cebcb29b71ba26985675a58e16e06b3c523b49f67c5497a33d1c"}, +] +numpy = [ + {file = "numpy-1.23.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c9f707b5bb73bf277d812ded9896f9512a43edff72712f31667d0a8c2f8e71ee"}, + {file = "numpy-1.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffcf105ecdd9396e05a8e58e81faaaf34d3f9875f137c7372450baa5d77c9a54"}, + {file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ea3f98a0ffce3f8f57675eb9119f3f4edb81888b6874bc1953f91e0b1d4f440"}, + {file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004f0efcb2fe1c0bd6ae1fcfc69cc8b6bf2407e0f18be308612007a0762b4089"}, + {file = "numpy-1.23.3-cp310-cp310-win32.whl", hash = "sha256:98dcbc02e39b1658dc4b4508442a560fe3ca5ca0d989f0df062534e5ca3a5c1a"}, + {file = "numpy-1.23.3-cp310-cp310-win_amd64.whl", hash = "sha256:39a664e3d26ea854211867d20ebcc8023257c1800ae89773cbba9f9e97bae036"}, + {file = "numpy-1.23.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1f27b5322ac4067e67c8f9378b41c746d8feac8bdd0e0ffede5324667b8a075c"}, + {file = "numpy-1.23.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ad3ec9a748a8943e6eb4358201f7e1c12ede35f510b1a2221b70af4bb64295c"}, + {file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdc9febce3e68b697d931941b263c59e0c74e8f18861f4064c1f712562903411"}, + {file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:301c00cf5e60e08e04d842fc47df641d4a181e651c7135c50dc2762ffe293dbd"}, + {file = "numpy-1.23.3-cp311-cp311-win32.whl", hash = 
"sha256:7cd1328e5bdf0dee621912f5833648e2daca72e3839ec1d6695e91089625f0b4"}, + {file = "numpy-1.23.3-cp311-cp311-win_amd64.whl", hash = "sha256:8355fc10fd33a5a70981a5b8a0de51d10af3688d7a9e4a34fcc8fa0d7467bb7f"}, + {file = "numpy-1.23.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc6e8da415f359b578b00bcfb1d08411c96e9a97f9e6c7adada554a0812a6cc6"}, + {file = "numpy-1.23.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:22d43376ee0acd547f3149b9ec12eec2f0ca4a6ab2f61753c5b29bb3e795ac4d"}, + {file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a64403f634e5ffdcd85e0b12c08f04b3080d3e840aef118721021f9b48fc1460"}, + {file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efd9d3abe5774404becdb0748178b48a218f1d8c44e0375475732211ea47c67e"}, + {file = "numpy-1.23.3-cp38-cp38-win32.whl", hash = "sha256:f8c02ec3c4c4fcb718fdf89a6c6f709b14949408e8cf2a2be5bfa9c49548fd85"}, + {file = "numpy-1.23.3-cp38-cp38-win_amd64.whl", hash = "sha256:e868b0389c5ccfc092031a861d4e158ea164d8b7fdbb10e3b5689b4fc6498df6"}, + {file = "numpy-1.23.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09f6b7bdffe57fc61d869a22f506049825d707b288039d30f26a0d0d8ea05164"}, + {file = "numpy-1.23.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8c79d7cf86d049d0c5089231a5bcd31edb03555bd93d81a16870aa98c6cfb79d"}, + {file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d5420053bbb3dd64c30e58f9363d7a9c27444c3648e61460c1237f9ec3fa14"}, + {file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5422d6a1ea9b15577a9432e26608c73a78faf0b9039437b075cf322c92e98e7"}, + {file = "numpy-1.23.3-cp39-cp39-win32.whl", hash = "sha256:c1ba66c48b19cc9c2975c0d354f24058888cdc674bebadceb3cdc9ec403fb5d1"}, + {file = "numpy-1.23.3-cp39-cp39-win_amd64.whl", hash = "sha256:78a63d2df1d947bd9d1b11d35564c2f9e4b57898aae4626638056ec1a231c40c"}, + {file = 
"numpy-1.23.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:17c0e467ade9bda685d5ac7f5fa729d8d3e76b23195471adae2d6a6941bd2c18"}, + {file = "numpy-1.23.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91b8d6768a75247026e951dce3b2aac79dc7e78622fc148329135ba189813584"}, + {file = "numpy-1.23.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:94c15ca4e52671a59219146ff584488907b1f9b3fc232622b47e2cf832e94fb8"}, + {file = "numpy-1.23.3.tar.gz", hash = "sha256:51bf49c0cd1d52be0a240aa66f3458afc4b95d8993d2d04f0d91fa60c10af6cd"}, +] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] +pandas = [ + {file = "pandas-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0d8d7433d19bfa33f11c92ad9997f15a902bda4f5ad3a4814a21d2e910894484"}, + {file = "pandas-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5cc47f2ebaa20ef96ae72ee082f9e101b3dfbf74f0e62c7a12c0b075a683f03c"}, + {file = "pandas-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e8e5edf97d8793f51d258c07c629bd49d271d536ce15d66ac00ceda5c150eb3"}, + {file = "pandas-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41aec9f87455306496d4486df07c1b98c15569c714be2dd552a6124cd9fda88f"}, + {file = "pandas-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c76f1d104844c5360c21d2ef0e1a8b2ccf8b8ebb40788475e255b9462e32b2be"}, + {file = "pandas-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:1642fc6138b4e45d57a12c1b464a01a6d868c0148996af23f72dde8d12486bbc"}, + {file = "pandas-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:171cef540bfcec52257077816a4dbbac152acdb8236ba11d3196ae02bf0959d8"}, + {file = "pandas-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:a68a9b9754efff364b0c5ee5b0f18e15ca640c01afe605d12ba8b239ca304d6b"}, + {file = "pandas-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:86d87279ebc5bc20848b4ceb619073490037323f80f515e0ec891c80abad958a"}, + {file = "pandas-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:207d63ac851e60ec57458814613ef4b3b6a5e9f0b33c57623ba2bf8126c311f8"}, + {file = "pandas-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e252a9e49b233ff96e2815c67c29702ac3a062098d80a170c506dff3470fd060"}, + {file = "pandas-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:de34636e2dc04e8ac2136a8d3c2051fd56ebe9fd6cd185581259330649e73ca9"}, + {file = "pandas-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1d34b1f43d9e3f4aea056ba251f6e9b143055ebe101ed04c847b41bb0bb4a989"}, + {file = "pandas-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1b82ccc7b093e0a93f8dffd97a542646a3e026817140e2c01266aaef5fdde11b"}, + {file = "pandas-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4e30a31039574d96f3d683df34ccb50bb435426ad65793e42a613786901f6761"}, + {file = "pandas-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62e61003411382e20d7c2aec1ee8d7c86c8b9cf46290993dd8a0a3be44daeb38"}, + {file = "pandas-1.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc987f7717e53d372f586323fff441263204128a1ead053c1b98d7288f836ac9"}, + {file = "pandas-1.5.0-cp38-cp38-win32.whl", hash = "sha256:e178ce2d7e3b934cf8d01dc2d48d04d67cb0abfaffdcc8aa6271fd5a436f39c8"}, + {file = "pandas-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:33a9d9e21ab2d91e2ab6e83598419ea6a664efd4c639606b299aae8097c1c94f"}, + {file = "pandas-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:73844e247a7b7dac2daa9df7339ecf1fcf1dfb8cbfd11e3ffe9819ae6c31c515"}, + {file = "pandas-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:e9c5049333c5bebf993033f4bf807d163e30e8fada06e1da7fa9db86e2392009"}, + {file = "pandas-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:85a516a7f6723ca1528f03f7851fa8d0360d1d6121cf15128b290cf79b8a7f6a"}, + {file = "pandas-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:947ed9f896ee61adbe61829a7ae1ade493c5a28c66366ec1de85c0642009faac"}, + {file = "pandas-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7f38d91f21937fe2bec9449570d7bf36ad7136227ef43b321194ec249e2149d"}, + {file = "pandas-1.5.0-cp39-cp39-win32.whl", hash = "sha256:2504c032f221ef9e4a289f5e46a42b76f5e087ecb67d62e342ccbba95a32a488"}, + {file = "pandas-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:8a4fc04838615bf0a8d3a03ed68197f358054f0df61f390bcc64fbe39e3d71ec"}, + {file = "pandas-1.5.0.tar.gz", hash = "sha256:3ee61b881d2f64dd90c356eb4a4a4de75376586cd3c9341c6c0fcaae18d52977"}, +] +pandas-schema = [ + {file = "pandas_schema-0.3.6-py3-none-any.whl", hash = "sha256:7497621cdf8c191fca1ef6ded9caa6f2153b220f120a2686d921f80c8031994d"}, + {file = "pandas_schema-0.3.6.tar.gz", hash = "sha256:c6bfc52c4bae9cdd7420fbe8c4b0622b769457827c3fc819928405638caf605f"}, +] +pgzip = [ + {file = "pgzip-0.3.2-py3-none-any.whl", hash = "sha256:31557eecb9f8814c1b0933485835598ecf6ba879967fbfa771019962fd96abae"}, + {file = "pgzip-0.3.2.tar.gz", hash = "sha256:49ad5b0b848cb08967076cbdc55d2179bd77551966b993704c6deaaab9eed480"}, +] +pillow = [ + {file = "Pillow-9.2.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:a9c9bc489f8ab30906d7a85afac4b4944a572a7432e00698a7239f44a44e6efb"}, + {file = "Pillow-9.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:510cef4a3f401c246cfd8227b300828715dd055463cdca6176c2e4036df8bd4f"}, + {file = "Pillow-9.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7888310f6214f19ab2b6df90f3f06afa3df7ef7355fc025e78a3044737fab1f5"}, + {file = 
"Pillow-9.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:831e648102c82f152e14c1a0938689dbb22480c548c8d4b8b248b3e50967b88c"}, + {file = "Pillow-9.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cc1d2451e8a3b4bfdb9caf745b58e6c7a77d2e469159b0d527a4554d73694d1"}, + {file = "Pillow-9.2.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:136659638f61a251e8ed3b331fc6ccd124590eeff539de57c5f80ef3a9594e58"}, + {file = "Pillow-9.2.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6e8c66f70fb539301e064f6478d7453e820d8a2c631da948a23384865cd95544"}, + {file = "Pillow-9.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:37ff6b522a26d0538b753f0b4e8e164fdada12db6c6f00f62145d732d8a3152e"}, + {file = "Pillow-9.2.0-cp310-cp310-win32.whl", hash = "sha256:c79698d4cd9318d9481d89a77e2d3fcaeff5486be641e60a4b49f3d2ecca4e28"}, + {file = "Pillow-9.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:254164c57bab4b459f14c64e93df11eff5ded575192c294a0c49270f22c5d93d"}, + {file = "Pillow-9.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:adabc0bce035467fb537ef3e5e74f2847c8af217ee0be0455d4fec8adc0462fc"}, + {file = "Pillow-9.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:336b9036127eab855beec9662ac3ea13a4544a523ae273cbf108b228ecac8437"}, + {file = "Pillow-9.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50dff9cc21826d2977ef2d2a205504034e3a4563ca6f5db739b0d1026658e004"}, + {file = "Pillow-9.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb6259196a589123d755380b65127ddc60f4c64b21fc3bb46ce3a6ea663659b0"}, + {file = "Pillow-9.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b0554af24df2bf96618dac71ddada02420f946be943b181108cac55a7a2dcd4"}, + {file = "Pillow-9.2.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:15928f824870535c85dbf949c09d6ae7d3d6ac2d6efec80f3227f73eefba741c"}, + {file = 
"Pillow-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:bdd0de2d64688ecae88dd8935012c4a72681e5df632af903a1dca8c5e7aa871a"}, + {file = "Pillow-9.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5b87da55a08acb586bad5c3aa3b86505f559b84f39035b233d5bf844b0834b1"}, + {file = "Pillow-9.2.0-cp311-cp311-win32.whl", hash = "sha256:b6d5e92df2b77665e07ddb2e4dbd6d644b78e4c0d2e9272a852627cdba0d75cf"}, + {file = "Pillow-9.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6bf088c1ce160f50ea40764f825ec9b72ed9da25346216b91361eef8ad1b8f8c"}, + {file = "Pillow-9.2.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:2c58b24e3a63efd22554c676d81b0e57f80e0a7d3a5874a7e14ce90ec40d3069"}, + {file = "Pillow-9.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eef7592281f7c174d3d6cbfbb7ee5984a671fcd77e3fc78e973d492e9bf0eb3f"}, + {file = "Pillow-9.2.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dcd7b9c7139dc8258d164b55696ecd16c04607f1cc33ba7af86613881ffe4ac8"}, + {file = "Pillow-9.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a138441e95562b3c078746a22f8fca8ff1c22c014f856278bdbdd89ca36cff1b"}, + {file = "Pillow-9.2.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:93689632949aff41199090eff5474f3990b6823404e45d66a5d44304e9cdc467"}, + {file = "Pillow-9.2.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:f3fac744f9b540148fa7715a435d2283b71f68bfb6d4aae24482a890aed18b59"}, + {file = "Pillow-9.2.0-cp37-cp37m-win32.whl", hash = "sha256:fa768eff5f9f958270b081bb33581b4b569faabf8774726b283edb06617101dc"}, + {file = "Pillow-9.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:69bd1a15d7ba3694631e00df8de65a8cb031911ca11f44929c97fe05eb9b6c1d"}, + {file = "Pillow-9.2.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:030e3460861488e249731c3e7ab59b07c7853838ff3b8e16aac9561bb345da14"}, + {file = "Pillow-9.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:74a04183e6e64930b667d321524e3c5361094bb4af9083db5c301db64cd341f3"}, + {file = "Pillow-9.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d33a11f601213dcd5718109c09a52c2a1c893e7461f0be2d6febc2879ec2402"}, + {file = "Pillow-9.2.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fd6f5e3c0e4697fa7eb45b6e93996299f3feee73a3175fa451f49a74d092b9f"}, + {file = "Pillow-9.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a647c0d4478b995c5e54615a2e5360ccedd2f85e70ab57fbe817ca613d5e63b8"}, + {file = "Pillow-9.2.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:4134d3f1ba5f15027ff5c04296f13328fecd46921424084516bdb1b2548e66ff"}, + {file = "Pillow-9.2.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:bc431b065722a5ad1dfb4df354fb9333b7a582a5ee39a90e6ffff688d72f27a1"}, + {file = "Pillow-9.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1536ad017a9f789430fb6b8be8bf99d2f214c76502becc196c6f2d9a75b01b76"}, + {file = "Pillow-9.2.0-cp38-cp38-win32.whl", hash = "sha256:2ad0d4df0f5ef2247e27fc790d5c9b5a0af8ade9ba340db4a73bb1a4a3e5fb4f"}, + {file = "Pillow-9.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:ec52c351b35ca269cb1f8069d610fc45c5bd38c3e91f9ab4cbbf0aebc136d9c8"}, + {file = "Pillow-9.2.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ed2c4ef2451de908c90436d6e8092e13a43992f1860275b4d8082667fbb2ffc"}, + {file = "Pillow-9.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ad2f835e0ad81d1689f1b7e3fbac7b01bb8777d5a985c8962bedee0cc6d43da"}, + {file = "Pillow-9.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea98f633d45f7e815db648fd7ff0f19e328302ac36427343e4432c84432e7ff4"}, + {file = "Pillow-9.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7761afe0126d046974a01e030ae7529ed0ca6a196de3ec6937c11df0df1bc91c"}, + {file = "Pillow-9.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9a54614049a18a2d6fe156e68e188da02a046a4a93cf24f373bffd977e943421"}, + {file = "Pillow-9.2.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:5aed7dde98403cd91d86a1115c78d8145c83078e864c1de1064f52e6feb61b20"}, + {file = "Pillow-9.2.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:13b725463f32df1bfeacbf3dd197fb358ae8ebcd8c5548faa75126ea425ccb60"}, + {file = "Pillow-9.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:808add66ea764ed97d44dda1ac4f2cfec4c1867d9efb16a33d158be79f32b8a4"}, + {file = "Pillow-9.2.0-cp39-cp39-win32.whl", hash = "sha256:337a74fd2f291c607d220c793a8135273c4c2ab001b03e601c36766005f36885"}, + {file = "Pillow-9.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:fac2d65901fb0fdf20363fbd345c01958a742f2dc62a8dd4495af66e3ff502a4"}, + {file = "Pillow-9.2.0-pp37-pypy37_pp73-macosx_10_10_x86_64.whl", hash = "sha256:ad2277b185ebce47a63f4dc6302e30f05762b688f8dc3de55dbae4651872cdf3"}, + {file = "Pillow-9.2.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7b502bc34f6e32ba022b4a209638f9e097d7a9098104ae420eb8186217ebbb"}, + {file = "Pillow-9.2.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d1f14f5f691f55e1b47f824ca4fdcb4b19b4323fe43cc7bb105988cad7496be"}, + {file = "Pillow-9.2.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:dfe4c1fedfde4e2fbc009d5ad420647f7730d719786388b7de0999bf32c0d9fd"}, + {file = "Pillow-9.2.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:f07f1f00e22b231dd3d9b9208692042e29792d6bd4f6639415d2f23158a80013"}, + {file = "Pillow-9.2.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1802f34298f5ba11d55e5bb09c31997dc0c6aed919658dfdf0198a2fe75d5490"}, + {file = "Pillow-9.2.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17d4cafe22f050b46d983b71c707162d63d796a1235cdf8b9d7a112e97b15bac"}, + {file = "Pillow-9.2.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:96b5e6874431df16aee0c1ba237574cb6dff1dcb173798faa6a9d8b399a05d0e"}, + {file = "Pillow-9.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:0030fdbd926fb85844b8b92e2f9449ba89607231d3dd597a21ae72dc7fe26927"}, + {file = "Pillow-9.2.0.tar.gz", hash = "sha256:75e636fd3e0fb872693f23ccb8a5ff2cd578801251f3a4f6854c6a5d437d3c04"}, +] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] -polars = [] -psutil = [] +polars = [ + {file = "polars-0.14.26-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:04fe75e98aacb2eef284bc648b06527b69fac95a9d5ff81f27b81a5a2c5ff15c"}, + {file = "polars-0.14.26-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a1324de3345517733440348a39505842a99e6343cb2a2320e883454fdc27469e"}, + {file = "polars-0.14.26-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c253e079d6707cb3734144dca7373d8d21a3c00a789aba9aaea10524cbd9213"}, + {file = "polars-0.14.26-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ea6bac0f0e011e2ea404bad19c4fef819e4b80e4af9c90a7b3a915623d05eb9"}, + {file = "polars-0.14.26-cp37-abi3-win_amd64.whl", hash = "sha256:d8c9f104b538129026ba9414521fd50cc6862ee1ea600ee2f2489fff0f0a1e8a"}, + {file = "polars-0.14.26.tar.gz", hash = "sha256:f13f9580820507bb04bbc3398aa8125a56c1e2b0bdcc20e5ef0fbf2d051e8c06"}, +] +psutil = [ + {file = "psutil-5.9.2-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:8f024fbb26c8daf5d70287bb3edfafa22283c255287cf523c5d81721e8e5d82c"}, + {file = "psutil-5.9.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:b2f248ffc346f4f4f0d747ee1947963613216b06688be0be2e393986fe20dbbb"}, + {file = "psutil-5.9.2-cp27-cp27m-win32.whl", hash = "sha256:b1928b9bf478d31fdffdb57101d18f9b70ed4e9b0e41af751851813547b2a9ab"}, + {file = "psutil-5.9.2-cp27-cp27m-win_amd64.whl", hash = 
"sha256:404f4816c16a2fcc4eaa36d7eb49a66df2d083e829d3e39ee8759a411dbc9ecf"}, + {file = "psutil-5.9.2-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:94e621c6a4ddb2573d4d30cba074f6d1aa0186645917df42c811c473dd22b339"}, + {file = "psutil-5.9.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:256098b4f6ffea6441eb54ab3eb64db9ecef18f6a80d7ba91549195d55420f84"}, + {file = "psutil-5.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:614337922702e9be37a39954d67fdb9e855981624d8011a9927b8f2d3c9625d9"}, + {file = "psutil-5.9.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39ec06dc6c934fb53df10c1672e299145ce609ff0611b569e75a88f313634969"}, + {file = "psutil-5.9.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3ac2c0375ef498e74b9b4ec56df3c88be43fe56cac465627572dbfb21c4be34"}, + {file = "psutil-5.9.2-cp310-cp310-win32.whl", hash = "sha256:e4c4a7636ffc47b7141864f1c5e7d649f42c54e49da2dd3cceb1c5f5d29bfc85"}, + {file = "psutil-5.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:f4cb67215c10d4657e320037109939b1c1d2fd70ca3d76301992f89fe2edb1f1"}, + {file = "psutil-5.9.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:dc9bda7d5ced744622f157cc8d8bdd51735dafcecff807e928ff26bdb0ff097d"}, + {file = "psutil-5.9.2-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75291912b945a7351d45df682f9644540d564d62115d4a20d45fa17dc2d48f8"}, + {file = "psutil-5.9.2-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4018d5f9b6651f9896c7a7c2c9f4652e4eea53f10751c4e7d08a9093ab587ec"}, + {file = "psutil-5.9.2-cp36-cp36m-win32.whl", hash = "sha256:f40ba362fefc11d6bea4403f070078d60053ed422255bd838cd86a40674364c9"}, + {file = "psutil-5.9.2-cp36-cp36m-win_amd64.whl", hash = 
"sha256:9770c1d25aee91417eba7869139d629d6328a9422ce1cdd112bd56377ca98444"}, + {file = "psutil-5.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:42638876b7f5ef43cef8dcf640d3401b27a51ee3fa137cb2aa2e72e188414c32"}, + {file = "psutil-5.9.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91aa0dac0c64688667b4285fa29354acfb3e834e1fd98b535b9986c883c2ce1d"}, + {file = "psutil-5.9.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fb54941aac044a61db9d8eb56fc5bee207db3bc58645d657249030e15ba3727"}, + {file = "psutil-5.9.2-cp37-cp37m-win32.whl", hash = "sha256:7cbb795dcd8ed8fd238bc9e9f64ab188f3f4096d2e811b5a82da53d164b84c3f"}, + {file = "psutil-5.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:5d39e3a2d5c40efa977c9a8dd4f679763c43c6c255b1340a56489955dbca767c"}, + {file = "psutil-5.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd331866628d18223a4265371fd255774affd86244fc307ef66eaf00de0633d5"}, + {file = "psutil-5.9.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b315febaebae813326296872fdb4be92ad3ce10d1d742a6b0c49fb619481ed0b"}, + {file = "psutil-5.9.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7929a516125f62399d6e8e026129c8835f6c5a3aab88c3fff1a05ee8feb840d"}, + {file = "psutil-5.9.2-cp38-cp38-win32.whl", hash = "sha256:561dec454853846d1dd0247b44c2e66a0a0c490f937086930ec4b8f83bf44f06"}, + {file = "psutil-5.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:67b33f27fc0427483b61563a16c90d9f3b547eeb7af0ef1b9fe024cdc9b3a6ea"}, + {file = "psutil-5.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3591616fa07b15050b2f87e1cdefd06a554382e72866fcc0ab2be9d116486c8"}, + {file = "psutil-5.9.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:14b29f581b5edab1f133563272a6011925401804d52d603c5c606936b49c8b97"}, + {file = "psutil-5.9.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4642fd93785a29353d6917a23e2ac6177308ef5e8be5cc17008d885cb9f70f12"}, + {file = "psutil-5.9.2-cp39-cp39-win32.whl", hash = "sha256:ed29ea0b9a372c5188cdb2ad39f937900a10fb5478dc077283bf86eeac678ef1"}, + {file = "psutil-5.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:68b35cbff92d1f7103d8f1db77c977e72f49fcefae3d3d2b91c76b0e7aef48b8"}, + {file = "psutil-5.9.2.tar.gz", hash = "sha256:feb861a10b6c3bb00701063b37e4afc754f8217f0f09c42280586bd6ac712b5c"}, +] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] -pycparser = [] +pycparser = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] pyliftover = [ {file = "pyliftover-0.4.tar.gz", hash = "sha256:72bcfb7de907569b0eb75e86c817840365297d63ba43a961da394187e399da41"}, ] -pyparsing = [] -pysqlar = [] -pytest = [] -pytest-cov = [] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pysqlar = [ + {file = "pysqlar-0.1.3-py3-none-any.whl", hash = "sha256:d833924746f8c7478eb9070b18d62cbc17f6a12833758bba2258f251433ac9f8"}, + {file = "pysqlar-0.1.3.tar.gz", hash = "sha256:aa9e99601af0961c284b9f1bbadf1e25873fec191c43046f29952061b9db63e0"}, +] +pytest = [ + {file = "pytest-7.1.3-py3-none-any.whl", hash = 
"sha256:1377bda3466d70b55e3f5cecfa55bb7cfcf219c7964629b967c37cf0bda818b7"}, + {file = "pytest-7.1.3.tar.gz", hash = "sha256:4f365fec2dff9c1162f834d9f18af1ba13062db0c708bf7b946f8a5c76180c39"}, +] +pytest-cov = [ + {file = "pytest-cov-3.0.0.tar.gz", hash = "sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470"}, + {file = "pytest_cov-3.0.0-py3-none-any.whl", hash = "sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6"}, +] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] -pytz = [] -requests = [] -setuptools-scm = [] +pytz = [ + {file = "pytz-2022.4-py2.py3-none-any.whl", hash = "sha256:2c0784747071402c6e99f0bafdb7da0fa22645f06554c7ae06bf6358897e9c91"}, + {file = "pytz-2022.4.tar.gz", hash = "sha256:48ce799d83b6f8aab2020e369b627446696619e79645419610b9facd909b3174"}, +] +requests = [ + {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, + {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, +] +setuptools = [ + {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"}, + {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"}, +] +setuptools-scm = [ + {file = "setuptools_scm-7.0.5-py3-none-any.whl", hash = "sha256:7930f720905e03ccd1e1d821db521bff7ec2ac9cf0ceb6552dd73d24a45d3b02"}, + {file = "setuptools_scm-7.0.5.tar.gz", hash = "sha256:031e13af771d6f892b941adb6ea04545bbf91ebc5ce68c78aaf3fff6e1fb4844"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = 
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -531,6 +1114,57 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -typing-extensions = [] -urllib3 = [] -zstandard = [] +typing-extensions = [ + {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"}, + {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"}, +] +urllib3 = [ + {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, + {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, +] +zstandard = [ + {file = "zstandard-0.18.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef7e8a200e4c8ac9102ed3c90ed2aa379f6b880f63032200909c1be21951f556"}, + {file = "zstandard-0.18.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2dc466207016564805e56d28375f4f533b525ff50d6776946980dff5465566ac"}, + {file = "zstandard-0.18.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a2ee1d4f98447f3e5183ecfce5626f983504a4a0c005fbe92e60fa8e5d547ec"}, + {file = "zstandard-0.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d956e2f03c7200d7e61345e0880c292783ec26618d0d921dcad470cb195bbce2"}, + {file = "zstandard-0.18.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ce6f59cba9854fd14da5bfe34217a1501143057313966637b7291d1b0267bd1e"}, + {file = 
"zstandard-0.18.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7fa67cba473623848b6e88acf8d799b1906178fd883fb3a1da24561c779593b"}, + {file = "zstandard-0.18.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cdb44d7284c8c5dd1b66dfb86dda7f4560fa94bfbbc1d2da749ba44831335e32"}, + {file = "zstandard-0.18.0-cp310-cp310-win32.whl", hash = "sha256:63694a376cde0aa8b1971d06ca28e8f8b5f492779cb6ee1cc46bbc3f019a42a5"}, + {file = "zstandard-0.18.0-cp310-cp310-win_amd64.whl", hash = "sha256:702a8324cd90c74d9c8780d02bf55e79da3193c870c9665ad3a11647e3ad1435"}, + {file = "zstandard-0.18.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:46f679bc5dfd938db4fb058218d9dc4db1336ffaf1ea774ff152ecadabd40805"}, + {file = "zstandard-0.18.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc2a4de9f363b3247d472362a65041fe4c0f59e01a2846b15d13046be866a885"}, + {file = "zstandard-0.18.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd3220d7627fd4d26397211cb3b560ec7cc4a94b75cfce89e847e8ce7fabe32d"}, + {file = "zstandard-0.18.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:39e98cf4773234bd9cebf9f9db730e451dfcfe435e220f8921242afda8321887"}, + {file = "zstandard-0.18.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5228e596eb1554598c872a337bbe4e5afe41cd1f8b1b15f2e35b50d061e35244"}, + {file = "zstandard-0.18.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d4a8fd45746a6c31e729f35196e80b8f1e9987c59f5ccb8859d7c6a6fbeb9c63"}, + {file = "zstandard-0.18.0-cp36-cp36m-win32.whl", hash = "sha256:4cbb85f29a990c2fdbf7bc63246567061a362ddca886d7fae6f780267c0a9e67"}, + {file = "zstandard-0.18.0-cp36-cp36m-win_amd64.whl", hash = 
"sha256:bfa6c8549fa18e6497a738b7033c49f94a8e2e30c5fbe2d14d0b5aa8bbc1695d"}, + {file = "zstandard-0.18.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e02043297c1832f2666cd2204f381bef43b10d56929e13c42c10c732c6e3b4ed"}, + {file = "zstandard-0.18.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7231543d38d2b7e02ef7cc78ef7ffd86419437e1114ff08709fe25a160e24bd6"}, + {file = "zstandard-0.18.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c86befac87445927488f5c8f205d11566f64c11519db223e9d282b945fa60dab"}, + {file = "zstandard-0.18.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:999a4e1768f219826ba3fa2064fab1c86dd72fdd47a42536235478c3bb3ca3e2"}, + {file = "zstandard-0.18.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9df59cd1cf3c62075ee2a4da767089d19d874ac3ad42b04a71a167e91b384722"}, + {file = "zstandard-0.18.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1be31e9e3f7607ee0cdd60915410a5968b205d3e7aa83b7fcf3dd76dbbdb39e0"}, + {file = "zstandard-0.18.0-cp37-cp37m-win32.whl", hash = "sha256:490d11b705b8ae9dc845431bacc8dd1cef2408aede176620a5cd0cd411027936"}, + {file = "zstandard-0.18.0-cp37-cp37m-win_amd64.whl", hash = "sha256:266aba27fa9cc5e9091d3d325ebab1fa260f64e83e42516d5e73947c70216a5b"}, + {file = "zstandard-0.18.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8b2260c4e07dd0723eadb586de7718b61acca4083a490dda69c5719d79bc715c"}, + {file = "zstandard-0.18.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3af8c2383d02feb6650e9255491ec7d0824f6e6dd2bbe3e521c469c985f31fb1"}, + {file = "zstandard-0.18.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28723a1d2e4df778573b76b321ebe9f3469ac98988104c2af116dd344802c3f8"}, + {file = 
"zstandard-0.18.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19cac7108ff2c342317fad6dc97604b47a41f403c8f19d0bfc396dfadc3638b8"}, + {file = "zstandard-0.18.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:76725d1ee83a8915100a310bbad5d9c1fc6397410259c94033b8318d548d9990"}, + {file = "zstandard-0.18.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d716a7694ce1fa60b20bc10f35c4a22be446ef7f514c8dbc8f858b61976de2fb"}, + {file = "zstandard-0.18.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:49685bf9a55d1ab34bd8423ea22db836ba43a181ac6b045ac4272093d5cb874e"}, + {file = "zstandard-0.18.0-cp38-cp38-win32.whl", hash = "sha256:1af1268a7dc870eb27515fb8db1f3e6c5a555d2b7bcc476fc3bab8886c7265ab"}, + {file = "zstandard-0.18.0-cp38-cp38-win_amd64.whl", hash = "sha256:1dc2d3809e763055a1a6c1a73f2b677320cc9a5aa1a7c6cfb35aee59bddc42d9"}, + {file = "zstandard-0.18.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eea18c1e7442f2aa9aff1bb84550dbb6a1f711faf6e48e7319de8f2b2e923c2a"}, + {file = "zstandard-0.18.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8677ffc6a6096cccbd892e558471c901fd821aba12b7fbc63833c7346f549224"}, + {file = "zstandard-0.18.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:083dc08abf03807af9beeb2b6a91c23ad78add2499f828176a3c7b742c44df02"}, + {file = "zstandard-0.18.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c990063664c08169c84474acecc9251ee035871589025cac47c060ff4ec4bc1a"}, + {file = "zstandard-0.18.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:533db8a6fac6248b2cb2c935e7b92f994efbdeb72e1ffa0b354432e087bb5a3e"}, + {file = "zstandard-0.18.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:dbb3cb8a082d62b8a73af42291569d266b05605e017a3d8a06a0e5c30b5f10f0"}, + {file = "zstandard-0.18.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d6c85ca5162049ede475b7ec98e87f9390501d44a3d6776ddd504e872464ec25"}, + {file = "zstandard-0.18.0-cp39-cp39-win32.whl", hash = "sha256:75479e7c2b3eebf402c59fbe57d21bc400cefa145ca356ee053b0a08908c5784"}, + {file = "zstandard-0.18.0-cp39-cp39-win_amd64.whl", hash = "sha256:d85bfabad444812133a92fc6fbe463e1d07581dba72f041f07a360e63808b23c"}, + {file = "zstandard-0.18.0.tar.gz", hash = "sha256:0ac0357a0d985b4ff31a854744040d7b5754385d1f98f7145c30e02c6865cb6f"}, +] From 32edf288e8cb59bbd18a5662e5697116704c66b6 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 9 Nov 2022 14:43:06 +0000 Subject: [PATCH 25/54] be extremely lazy --- pgscatalog_utils/config.py | 31 ++++- pgscatalog_utils/match/combine_matches.py | 29 +++-- pgscatalog_utils/match/filter.py | 4 +- pgscatalog_utils/match/match.py | 31 +---- pgscatalog_utils/match/match_variants.py | 68 ++++++----- pgscatalog_utils/match/preprocess.py | 13 +- pgscatalog_utils/match/read.py | 12 +- pgscatalog_utils/match/tempdir.py | 48 ++++++++ pgscatalog_utils/match/write.py | 62 +++++----- pgscatalog_utils/target.py | 140 ++++++++++++---------- 10 files changed, 265 insertions(+), 173 deletions(-) create mode 100644 pgscatalog_utils/match/tempdir.py diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 0fe2bf9..1cff092 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -1,7 +1,22 @@ +import atexit import logging +import os +import tempfile -POLARS_MAX_THREADS: int = 1 # dummy value, is reset by args.n_threads (default: 1) +import polars as pl + +from pgscatalog_utils.match import tempdir + +N_THREADS: int = 1 # dummy value, is reset by args.n_threads (default: 1) OUTDIR: str = "." 
# dummy value, reset by args.outdir +TEMPDIR: tempfile.TemporaryDirectory = tempfile.TemporaryDirectory() + +logger = logging.getLogger(__name__) + + +def setup_cleaning(): + logger.debug(F"Temporary directory set up: {TEMPDIR.name}") + atexit.register(tempdir.cleanup) def set_logging_level(verbose: bool): @@ -16,3 +31,17 @@ def set_logging_level(verbose: bool): logging.basicConfig(level=logging.WARNING, format=log_fmt, datefmt='%Y-%m-%d %H:%M:%S') + + +def setup_polars_threads(n: int): + global N_THREADS + N_THREADS = n + os.environ['POLARS_MAX_THREADS'] = str(N_THREADS) + logger.debug(f"Using {N_THREADS} threads to read CSVs") + logger.debug(f"polars threadpool size: {pl.threadpool_size()}") + + if pl.threadpool_size() != N_THREADS: + logger.warning(f"polars threadpool doesn't match -n argument ({pl.threadpool_size()} vs {n})") + logger.info("To silence this warning, set POLARS_MAX_THREADS to match -n before running combine_matches, e.g.:") + logger.info("$ export POLARS_MAX_THREADS=x") + logger.info("$ combine_matches ... 
-n x") diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 0303840..f799729 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -1,6 +1,5 @@ import argparse import logging -import os import polars as pl @@ -14,29 +13,33 @@ def combine_matches(): args = _parse_args() config.set_logging_level(args.verbose) - - config.POLARS_MAX_THREADS = args.n_threads + config.setup_polars_threads(args.n_threads) config.OUTDIR = args.outdir - os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) # TODO: this won't work (after import) - # now the environment variable, parsed argument args.n_threads, and threadpool should agree - logger.debug(f"Setting POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") - logger.debug(f"Using {config.POLARS_MAX_THREADS} threads to read CSVs") - logger.debug(f"polars threadpool size: {pl.threadpool_size()}") with pl.StringCache(): scorefile = read_scorefile(path=args.scorefile, chrom=None) # chrom=None to read all variants logger.debug("Reading matches") - matches = pl.concat([pl.read_ipc(x, memory_map=False, rechunk=False) for x in args.matches], rechunk=False) - logger.debug("Rechunking matches") - matches.rechunk() + matches = pl.concat([pl.scan_ipc(x, memory_map=False, rechunk=False) for x in args.matches], rechunk=False) # make sure there's no duplicate variant_ids across matches in multiple pvars # processing batched chromosomes with overlapping variants might cause problems # e.g. 
chr1 1-100000, chr1 100001-500000 - assert matches.filter(pl.col('match_status') == 'matched').groupby(['accession', 'ID']).count()['count'].max() == 1, "Duplicate IDs in final matches" + _check_duplicate_vars(matches) dataset = args.dataset.replace('_', '-') # _ used as delimiter in pgsc_calc - log_and_write(matches=matches.lazy(), scorefile=scorefile, dataset=dataset, args=args) + log_and_write(matches=matches, scorefile=scorefile, dataset=dataset, args=args) + + +def _check_duplicate_vars(matches: pl.LazyFrame): + max_occurrence: list[int] = (matches.filter(pl.col('match_status') == 'matched') + .groupby(['accession', 'ID']) + .agg(pl.count()) + .select('count') + .max() + .collect() + .get_column('count') + .to_list()) + assert max_occurrence == [1], "Duplicate IDs in final matches" def _parse_args(args=None): diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index 14e3ed9..695b5c2 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -27,9 +27,7 @@ def filter_scores(scorefile: pl.LazyFrame, matches: pl.LazyFrame, min_overlap: f score_summary: pl.LazyFrame = pl.concat(scores).lazy() filtered_scores: pl.LazyFrame = (filtered_matches.join(score_summary, on='accession', how='left') - .filter(pl.col('score_pass') == True) - .select(['chr_name', 'ID', 'accession', 'effect_type', 'matched_effect_allele', - 'effect_weight'])) + .filter(pl.col('score_pass') == True)) return filtered_scores, score_summary diff --git a/pgscatalog_utils/match/match.py b/pgscatalog_utils/match/match.py index 4363dd5..ff2587d 100644 --- a/pgscatalog_utils/match/match.py +++ b/pgscatalog_utils/match/match.py @@ -1,7 +1,4 @@ -import gc import logging -import os -from tempfile import TemporaryDirectory import polars as pl @@ -9,7 +6,7 @@ # @profile # decorator needed to annotate memory profiles, but will cause NameErrors outside of profiling -def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, low_memory: bool = 
True) -> pl.LazyFrame: +def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame) -> list[pl.LazyFrame]: scorefile_oa = scorefile.filter(pl.col("other_allele") != None) scorefile_no_oa = scorefile.filter(pl.col("other_allele") == None) @@ -30,31 +27,7 @@ def get_all_matches(scorefile: pl.LazyFrame, target: pl.LazyFrame, low_memory: b matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_ref_flip").select(col_order)) matches.append(_match_variants(scorefile_no_oa, target, match_type="no_oa_alt_flip").select(col_order)) - if low_memory: - logger.debug("Batch collecting matches (low memory mode)") - match_lf = _batch_collect(matches) - else: - logger.debug("Collecting all matches (parallel)") - match_lf = pl.concat(pl.collect_all(matches)) - - return match_lf.lazy() - - -def _batch_collect(matches: list[pl.LazyFrame]) -> pl.DataFrame: - """ A slower alternative to pl.collect_all(), but this approach will use less peak memory - - This batches the .collect() and writes intermediate results to a temporary working directory - - IPC files are binary and remember column schema. Reading them can be extremely fast. 
""" - with TemporaryDirectory() as temp_dir: - n_chunks = 0 - for i, match in enumerate(matches): - out_path = os.path.join(temp_dir, str(i) + ".ipc") - match.collect().write_ipc(out_path) - n_chunks += 1 - logger.debug(f"Staged {n_chunks} match chunks to {temp_dir}") - gc.collect() - return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) + return matches def _match_variants(scorefile: pl.LazyFrame, target: pl.LazyFrame, match_type: str) -> pl.LazyFrame: diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index bce3344..97486cc 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,18 +1,19 @@ import argparse import logging -import os +import shutil import sys import textwrap import polars as pl import pgscatalog_utils.config as config +from pgscatalog_utils.match import tempdir from pgscatalog_utils.match.filter import filter_scores from pgscatalog_utils.match.label import label_matches from pgscatalog_utils.match.log import make_logs, make_summary_log, check_log_count from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.read import read_target, read_scorefile -from pgscatalog_utils.match.write import write_log, write_out +from pgscatalog_utils.match.write import write_log, write_scorefiles logger = logging.getLogger(__name__) @@ -20,19 +21,14 @@ def match_variants(): args = _parse_args() config.set_logging_level(args.verbose) - - config.POLARS_MAX_THREADS = args.n_threads - os.environ['POLARS_MAX_THREADS'] = str(config.POLARS_MAX_THREADS) - # now the environment variable, parsed argument args.n_threads, and threadpool should agree - logger.debug(f"Setting POLARS_MAX_THREADS environment variable: {os.getenv('POLARS_MAX_THREADS')}") - logger.debug(f"Using {config.POLARS_MAX_THREADS} threads to read CSVs") - logger.debug(f"polars threadpool size: {pl.threadpool_size()}") + config.setup_polars_threads(args.n_threads) + config.setup_cleaning() 
with pl.StringCache(): scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile, chrom=args.chrom) target_paths = list(set(args.target)) n_target_files = len(target_paths) - matches: pl.DataFrame + matches: pl.LazyFrame if n_target_files == 0: logger.critical("No target genomes found, check the path") @@ -52,24 +48,24 @@ def match_variants(): case "single": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = True reads one target in chunks - matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args, low_memory) + matches: list[list[pl.LazyFrame]] = _fast_match(target_paths, scorefile, args, low_memory) case "multi": logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks - matches: pl.LazyFrame = _match_multiple_targets(target_paths, scorefile, args, low_memory) + matches: list[list[pl.LazyFrame]] = _match_multiple_targets(target_paths, scorefile, args, low_memory) case "fast": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = False just read everything into memory for speed - matches: pl.LazyFrame = _fast_match(target_paths, scorefile, args, low_memory) + matches: list[list[pl.LazyFrame]] = _fast_match(target_paths, scorefile, args, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception dataset = args.dataset.replace('_', '-') # underscores are delimiters in pgs catalog calculator + match_dir, matches = _materialise_matches(matches, dataset, low_memory) if args.only_match: - fout: str = f"{dataset}_{args.chrom}_matches.ipc.zst" - logger.debug(f"--only_match set, writing out match candidates {fout} and exiting") - matches.collect().write_ipc(fout, compression="zstd") + logger.debug(f"--only_match set, writing out match candidates {match_dir} and exiting") + shutil.move(match_dir, args.outdir) logger.debug("Intermediate files can be processed with combine_matches") sys.exit(0) else: @@ -86,8 +82,7 @@ def log_and_write(matches: pl.LazyFrame, 
scorefile: pl.LazyFrame, dataset: str, logger.critical("Error: no target variants match any variants in scoring files") raise Exception("No valid matches found") - write_out(valid_matches, args.split, dataset) - del valid_matches + write_scorefiles(valid_matches, args.split, dataset) big_log: pl.LazyFrame = make_logs(scorefile=scorefile, match_candidates=matches, dataset=dataset) summary_log: pl.LazyFrame = make_summary_log(match_candidates=matches, filter_summary=filter_summary, @@ -95,10 +90,25 @@ def log_and_write(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str, scorefile=scorefile) check_log_count(summary_log=summary_log, scorefile=scorefile) - write_log(df=big_log, prefix=dataset, chrom=None, outdir=args.outdir, file_format="csv") + write_log(df=big_log, prefix=dataset, chrom=None, outdir=args.outdir) summary_log.collect().write_csv(f"{dataset}_summary.csv") +def _materialise_matches(matches: list[list[pl.LazyFrame]], dataset: str, low_memory: bool) -> tuple[str, pl.LazyFrame]: + """ Collect query plan and store results in temporary files""" + # outer list: [target_1, target_2] + # inner list: [ match_1, match_2 ] + for i, match in enumerate(matches): + fout = tempdir.get_tmp_path("matches", f"match_{i}.ipc.zst") + if low_memory: + pl.concat([x.collect() for x in match]).write_ipc(fout) + else: + pl.concat(pl.collect_all(match)).write_ipc(fout) + match_dir: str = tempdir.get_tmp_path("matches", "") + ldf: pl.LazyFrame = pl.scan_ipc(match_dir + "*.ipc.zst", memory_map=False) + return match_dir, ldf + + def _check_target_chroms(target: pl.LazyFrame) -> None: chroms: list[str] = target.select(pl.col("#CHROM").unique()).collect().get_column("#CHROM").to_list() if len(chroms) > 1: @@ -109,28 +119,28 @@ def _check_target_chroms(target: pl.LazyFrame) -> None: def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, - args: argparse.Namespace, low_memory: bool) -> pl.LazyFrame: + args: argparse.Namespace, low_memory: bool) -> 
list[list[pl.LazyFrame]]: # fast match is fast because: # 1) all target files are read into memory without batching # 2) matching occurs without iterating through chromosomes - # when low memory is true and n_targets = 1, fast match is the same as "single" match mode params: dict[str, bool] = _make_params_dict(args) target: pl.LazyFrame = read_target(paths=target_paths, low_memory=low_memory) - return (get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory) - .pipe(label_matches, params=params)) + matches = get_all_matches(scorefile=scorefile, target=target) + return [[x.pipe(label_matches, params=params) for x in matches]] def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace, - low_memory: bool) -> pl.LazyFrame: - matches = [] + low_memory: bool) -> list[list[pl.LazyFrame]]: + match_lst = [] params: dict[str, bool] = _make_params_dict(args) for i, loc_target_current in enumerate(target_paths): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.LazyFrame = read_target(paths=[loc_target_current], low_memory=low_memory) - _check_target_chroms(target) - matches.append(get_all_matches(scorefile=scorefile, target=target, low_memory=low_memory)) - return (pl.concat(matches) - .pipe(label_matches, params=params)) + if len(target_paths) > 1: + _check_target_chroms(target) + matches: list[pl.LazyFrame] = get_all_matches(scorefile=scorefile, target=target) + match_lst.append([x.pipe(label_matches, params=params) for x in matches]) + return match_lst def _description_text() -> str: diff --git a/pgscatalog_utils/match/preprocess.py b/pgscatalog_utils/match/preprocess.py index 9997176..405aa07 100644 --- a/pgscatalog_utils/match/preprocess.py +++ b/pgscatalog_utils/match/preprocess.py @@ -5,7 +5,7 @@ logger = logging.getLogger(__name__) -def filter_target(df: pl.DataFrame) -> pl.DataFrame: +def filter_target(df: pl.LazyFrame) -> pl.LazyFrame: """ Remove variants that won't 
be matched against the scorefile Chromosomes 1 - 22, X, and Y with an efficient join. Remmove variants with missing identifiers also @@ -15,7 +15,7 @@ def filter_target(df: pl.DataFrame) -> pl.DataFrame: return df.filter((pl.col('#CHROM').is_in(chroms)) & (pl.col('ID') != '.')) -def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataFrame: +def complement_valid_alleles(df: pl.LazyFrame, flip_cols: list[str]) -> pl.LazyFrame: """ Improved function to complement alleles. Will only complement sequences that are valid DNA. """ for col in flip_cols: @@ -37,19 +37,18 @@ def complement_valid_alleles(df: pl.DataFrame, flip_cols: list[str]) -> pl.DataF return df -def annotate_multiallelic(df: pl.DataFrame) -> pl.DataFrame: +def annotate_multiallelic(df: pl.LazyFrame) -> pl.LazyFrame: """ Identify variants that are multiallelic with a column flag """ # plink2 pvar multi-alleles are comma-separated - df: pl.DataFrame = (df.with_column( + df: pl.LazyFrame = (df.with_column( pl.when(pl.col("ALT").str.contains(',')) .then(pl.lit(True)) .otherwise(pl.lit(False)) .alias('is_multiallelic'))) - if (df.get_column('is_multiallelic')).any(): + if (df.select('is_multiallelic').unique().collect().get_column('is_multiallelic')).any(): logger.debug("Exploding dataframe to handle multiallelic variants") - df.replace('ALT', df['ALT'].str.split(by=',')) # turn ALT to list of variants - return df.explode('ALT') # expand the DF to have all the variants in different rows + return df.with_column(pl.col('ALT').str.split(by=',')).explode('ALT') else: logger.debug("No multiallelic variants detected") return df diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index b98ff2e..a77f037 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -3,6 +3,8 @@ import typing import polars as pl +from pgscatalog_utils.match.tempdir import get_tmp_path + from pgscatalog_utils.match.preprocess import annotate_multiallelic, 
complement_valid_alleles, filter_target from pgscatalog_utils.target import Target @@ -17,7 +19,7 @@ def read_target(paths: list[str], low_memory: bool) -> pl.LazyFrame: return (pl.concat([x.read() for x in targets]) .pipe(filter_target) .pipe(annotate_multiallelic) - .with_column(pl.col('ALT').cast(pl.Categorical))).lazy() + .with_column(pl.col('ALT').cast(pl.Categorical))) def read_scorefile(path: str, chrom: typing.Union[str, None]) -> pl.LazyFrame: @@ -28,7 +30,13 @@ def read_scorefile(path: str, chrom: typing.Union[str, None]) -> pl.LazyFrame: 'other_allele': pl.Utf8, 'effect_type': pl.Categorical, 'accession': pl.Categorical} - ldf = pl.read_csv(path, sep = '\t', dtype=dtypes).lazy() + + # parse CSV and write to temporary feather file + # enforce laziness! scanning is very fast and saves memory + fout: str = get_tmp_path("scorefile", "scorefile.ipc") + (pl.read_csv(path, sep='\t', dtype=dtypes).write_ipc(fout)) + ldf: pl.LazyFrame = pl.scan_ipc(fout) + if chrom is not None: logger.debug(f"--chrom set, filtering scoring file to chromosome {chrom}") ldf = ldf.filter(pl.col('chr_name') == chrom) # add filter to query plan diff --git a/pgscatalog_utils/match/tempdir.py b/pgscatalog_utils/match/tempdir.py new file mode 100644 index 0000000..e937b99 --- /dev/null +++ b/pgscatalog_utils/match/tempdir.py @@ -0,0 +1,48 @@ +import logging +import os + +from pgscatalog_utils import config + +logger = logging.getLogger(__name__) + + +def get_tmp_path(subdir: str, fn: str) -> str: + """ Create a subdirectory in the tempodir and return a full path to fn + + subdir: 'input', fn: 'test.txt' -> '/path/tp/tmpdir/input/test.txt' + """ + path: str = os.path.join(config.TEMPDIR.name, subdir) + if not os.path.exists(path): + os.mkdir(path) + + return os.path.join(path, fn) + + +def cleanup(): + """ A temporary directory is used to store staged data in feather format. 
+ + tempdir/ + ├── target + ├── scorefile + └── matched + + Data are staged to disk for a few different reasons: + + target/ and scorefile/: + - Raw text data may be compressed with zstd or gzip, which can't be lazily read + - Parsing a very large file causes big RAM spikes + - To mitigate this, optionally read and parse in batches (compressed + uncompressed) + - ipc are uncompressed to allow fast memory mapping + - These files should always be cleaned up by python or the host OS (SIGTERM breaks atexit) + + matched/ + - Split - apply - combine means a common use case is to just write matches and exit + - In this case, files are saved (moved to args.outdir) and used as input to combine_matches + - In other cases, post-processing of matches makes complex query plans, which failed when collected + - Re-scanning collected files on disk prevents this problem + - ipc are compressed to save space. Further processing takes some time, so decompression is ok. + + This function is registered with atexit to run when the program ends. 
+ """ + logger.debug(f"Cleaning up tempdir path {config.TEMPDIR}") + config.TEMPDIR.cleanup() diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 33d785f..327722b 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -9,8 +9,23 @@ logger = logging.getLogger(__name__) -def write_out(matches: pl.LazyFrame, split: bool, dataset: str): - chroms: list[str] = matches.select("chr_name").unique().collect().get_column("chr_name").to_list() +def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], outdir: str) -> None: + if chrom is None: + log_name: str = os.path.join(os.path.abspath(outdir), f"{prefix}_log") + else: + log_name: str = os.path.join(os.path.abspath(outdir), f"{prefix}_chrom{chrom}_log") + + fout: str = ''.join([log_name, ".csv.gz"]) + if os.path.exists(fout): + logger.warning(f"Overwriting log that already exists: {fout}") + os.remove(fout) + + _write_text_pgzip(df=df, fout=fout) + + +def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): + # TODO: fix + chroms: list[str] = matches.select("chr_name").unique().collect(projection_pushdown=False).get_column("chr_name").to_list() for chrom in chroms: # 1. filter by chromosome chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) @@ -38,36 +53,29 @@ def _write_split(deduplicated: dict[str: tuple[int, pl.LazyFrame]], chrom: str, # pivoting is !! _expensive_ !! (it collects the lazyframe) pivoted: pl.LazyFrame = _pivot_score(et_df, chrom) fout = os.path.join(config.OUTDIR, f"{dataset}_{chrom}_{effect_type}_{i}.scorefile.gz") - _write_scorefile(pivoted, fout) + _write_text_pgzip(pivoted, fout) -def _write_scorefile(df, fout): - logger.debug(f"Writing matched scorefile to {fout}") - with pgzip.open(fout, 'wb', thread=config.POLARS_MAX_THREADS) as f: - df.collect().write_csv(f) - +def _write_text_pgzip(df: pl.LazyFrame, fout: str, append: bool = False): + """ Write a df to a text file (e.g. 
CSV / TSV) using parallel gzip, optionally appending to an existing file -def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], file_format: str, outdir: str) -> None: - # feather file preserves dtypes and is small - # don't compress the feather file to allow memory mapping - if chrom is None: - log_name: str = os.path.join(os.path.abspath(outdir), f"{prefix}_log") + Notes: + - Compression performance isn't ideal when concatenating gzip streams (append = True) + - Generally it's best to feed compression algorithms all data and write in one go + - However, df will normally be very big + - It's collected for the first time in this function, and joins _a lot_ of data (contains all match candidates) + - The files created by this function must be human-readable text files, so feather / parquet isn't helpful + - Hopefully appending gzip streams is a reasonable compromise to mitigate OOM errors + """ + if append: + logger.debug(f"Appending to {fout}") + mode = 'ab' else: - log_name: str = os.path.join(os.path.abspath(outdir), f"{prefix}_chrom{chrom}_log") + logger.debug(f"Writing to {fout}") + mode = 'wb' - match file_format: - case 'ipc': - fout: str = ''.join([log_name, ".ipc.zst"]) - logger.debug(f"Writing {fout} in format: {file_format}") - df.collect().write_ipc(fout, compression='zstd') # gzip compression not supported - case 'csv': - fout: str = ''.join([log_name, ".csv.gz"]) - logger.debug(f"Writing {fout} in format: {file_format}") - with pgzip.open(fout, 'wb', thread=config.POLARS_MAX_THREADS) as f: - df.collect().write_csv(f) - case _: - logger.critical(f"Invalid format: {file_format}") - raise Exception + with pgzip.open(fout, mode, thread=config.N_THREADS) as f: + df.collect().write_csv(f) def _pivot_score(df: pl.LazyFrame, chrom: str) -> pl.LazyFrame: diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 3573ee6..ab52971 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -2,14 +2,15 @@ 
import io import logging import os +import pathlib from dataclasses import dataclass from itertools import islice -from tempfile import TemporaryDirectory import polars as pl import zstandard import pgscatalog_utils.config as config +from pgscatalog_utils.match.tempdir import get_tmp_path logger = logging.getLogger(__name__) @@ -57,30 +58,42 @@ def read(self): logger.debug("Reading uncompressed target genome (fast mode, high RAM usage)") return self._read_uncompressed() - def _read_compressed(self) -> pl.DataFrame: + def _read_compressed(self) -> pl.LazyFrame: """ Read a zst compressed target as quickly as possible """ with open(self.path, 'rb') as fh: dctx = zstandard.ZstdDecompressor() with dctx.stream_reader(fh) as reader: dtypes = _get_col_dtypes(self.file_format) col_idxs, new_col_names = _default_cols(self.file_format) - return (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', - dtype=dtypes, - columns=col_idxs, - new_columns=new_col_names, - n_threads=config.POLARS_MAX_THREADS)) - def _read_uncompressed(self) -> pl.DataFrame: + fn: str = pathlib.Path(self.path).stem + ".ipc" + fout = get_tmp_path("input", fn) + + (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=config.N_THREADS) + .write_ipc(fout)) + return pl.scan_ipc(fout) + + def _read_uncompressed(self) -> pl.LazyFrame: """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. 
""" dtypes = _get_col_dtypes(self.file_format) col_idxs, new_col_names = _default_cols(self.file_format) - return (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', - dtype=dtypes, - columns=col_idxs, - new_columns=new_col_names, - n_threads=config.POLARS_MAX_THREADS)) - def _read_uncompressed_chunks(self) -> pl.DataFrame: + fn: str = pathlib.Path(self.path).stem + ".ipc" + fout: str = get_tmp_path("input", fn) + + (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=config.N_THREADS) + .write_ipc(fout)) + return pl.scan_ipc(fout) + + def _read_uncompressed_chunks(self) -> pl.LazyFrame: """ Read a CSV using a BufferedReader in batches to reduce memory usage. Reads 1 million variant chunks and immediately writes to feather format in a temporary directory. @@ -91,29 +104,31 @@ def _read_uncompressed_chunks(self) -> pl.DataFrame: """ dtypes = _get_col_dtypes(self.file_format) col_idxs, new_col_names = _default_cols(self.file_format) - with TemporaryDirectory() as temp_dir: - batch_n = 0 - batch_size = int(1e6) - with open(self.path, 'rb') as f: - while True: - line_batch = b''.join(islice(f, batch_size)) - if not line_batch: - break - - out_path = os.path.join(temp_dir, str(batch_n) + '.ipc') - - (pl.read_csv(line_batch, sep='\t', has_header=False, comment_char='#', - dtype=dtypes, - columns=col_idxs, - new_columns=new_col_names, - n_threads=config.POLARS_MAX_THREADS).write_ipc(out_path)) - batch_n += 1 - - gc.collect() # just to be safe - logger.debug(f"{batch_n} batches staged in temporary directory {temp_dir}") - return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) - def _read_compressed_chunks(self) -> pl.DataFrame: + batch_n = 0 + batch_size = int(1e6) + with open(self.path, 'rb') as f: + while True: + line_batch = b''.join(islice(f, batch_size)) + if not line_batch: + break + + fn: str = str(batch_n) + ".ipc" + fout: str = get_tmp_path("input", fn) + + 
(pl.read_csv(line_batch, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=col_idxs, + new_columns=new_col_names, + n_threads=config.N_THREADS) + .write_ipc(fout)) + batch_n += 1 + + gc.collect() # just to be safe + logger.debug(f"{batch_n} batches staged in temporary directory {config.TEMPDIR}") + return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc")) + + def _read_compressed_chunks(self) -> pl.LazyFrame: """ Like _read_uncompressed_chunks, but read chunks of bytes and handle incomplete rows zstd returns chunks of bytes, not lines, but encoding utf-8 will be faster in rust and polars @@ -123,38 +138,38 @@ def _read_compressed_chunks(self) -> pl.DataFrame: columns, new_col_names = _default_cols(self.file_format) n_chunks = 0 + with open(self.path, 'rb') as fh: + dctx = zstandard.ZstdDecompressor() + chunk_buffer = b'' - with TemporaryDirectory() as temp_dir: - with open(self.path, 'rb') as fh: - dctx = zstandard.ZstdDecompressor() - chunk_buffer = b'' + for chunk in dctx.read_to_iter(fh, read_size=int(1e8), write_size=int(1e8)): + if not chunk: + logger.debug("Finished reading zstd compressed chunks") + break - for chunk in dctx.read_to_iter(fh, read_size=int(1e8), write_size=int(1e8)): - if not chunk: - logger.debug("Finished reading zstd compressed chunks") - break + end = chunk.rfind(b'\n') + 1 # only want to read complete rows, which end in \n + if chunk_buffer: + row_chunk = b''.join([chunk_buffer, chunk[:end]]) + chunk_buffer = b'' + else: + row_chunk = chunk[:end] - end = chunk.rfind(b'\n') + 1 # only want to read complete rows, which end in \n - if chunk_buffer: - row_chunk = b''.join([chunk_buffer, chunk[:end]]) - chunk_buffer = b'' - else: - row_chunk = chunk[:end] + fn: str = str(n_chunks) + ".ipc" + fout: str = get_tmp_path("input", fn) - out_path = os.path.join(temp_dir, str(n_chunks) + ".ipc") - (pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', - dtype=dtypes, - columns=columns, - 
new_columns=new_col_names, - n_threads=config.POLARS_MAX_THREADS) - .write_ipc(out_path)) + (pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', + dtype=dtypes, + columns=columns, + new_columns=new_col_names, + n_threads=config.N_THREADS) + .write_ipc(fout)) - chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) - n_chunks += 1 + chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) + n_chunks += 1 - gc.collect() # just to be safe - logger.debug(f"{n_chunks} chunks") # write_size will change n_chunks - return pl.read_ipc(os.path.join(temp_dir, "*.ipc")) + gc.collect() # just to be safe + logger.debug(f"{n_chunks} chunks") # write_size will change n_chunks + return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc")) def _get_col_dtypes(file_format): @@ -219,3 +234,4 @@ def _default_cols(file_format) -> tuple[list[int], list[str]]: case _: logger.critical("Trying to get column idx for an invalid file format, TWENTY THREE NINETEEN") raise Exception + From 343a79926e652e84326c88e64aab3ece7c5774ce Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 9 Nov 2022 15:29:30 +0000 Subject: [PATCH 26/54] Better log message --- pgscatalog_utils/match/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/filter.py b/pgscatalog_utils/match/filter.py index 695b5c2..9c912ee 100644 --- a/pgscatalog_utils/match/filter.py +++ b/pgscatalog_utils/match/filter.py @@ -40,7 +40,7 @@ def _calculate_match_rate(df: pl.LazyFrame) -> pl.LazyFrame: def _filter_matches(df: pl.LazyFrame) -> pl.LazyFrame: - logger.debug("Filtering variants with exclude flag") + logger.debug("Filtering to best_match variants (with exclude flag = False)") return df.filter((pl.col('best_match') == True) & (pl.col('exclude') == False)) From fc6e604c7ab5c37de9b520ef0bfd32b62013ad03 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 9 Nov 2022 15:31:39 +0000 Subject: [PATCH 27/54] don't .fetch() --- pgscatalog_utils/match/match_variants.py | 
3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 97486cc..680f524 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -78,7 +78,8 @@ def log_and_write(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str, valid_matches, filter_summary = filter_scores(scorefile=scorefile, matches=matches, dataset=dataset, min_overlap=args.min_overlap) - if valid_matches.fetch().is_empty(): # this can happen if args.min_overlap = 0 + if filter_summary.filter(pl.col("score_pass") == True).collect().is_empty(): + # this can happen when args.min_overlap = 0 logger.critical("Error: no target variants match any variants in scoring files") raise Exception("No valid matches found") From f671ea03b453f64f209ed3332078a59ae3782319 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Wed, 9 Nov 2022 15:57:46 +0000 Subject: [PATCH 28/54] Add ability to make a single scoring file --- pgscatalog_utils/match/combine_matches.py | 2 +- pgscatalog_utils/match/write.py | 33 ++++++++++++++++------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index f799729..7e38807 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -50,7 +50,7 @@ def _parse_args(args=None): type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('-s', '--scorefile', dest='scorefile', required=True, help=' Path to scorefile') - parser.add_argument('--split', dest='split', default=True, action='store_true', + parser.add_argument('--split', dest='split', default=False, action='store_true', help=' Split scorefile per chromosome?') parser.add_argument('-m', '--matches', dest='matches', required=True, nargs='+', help=' List of match files') diff --git a/pgscatalog_utils/match/write.py 
b/pgscatalog_utils/match/write.py index 327722b..9492733 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -24,23 +24,36 @@ def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], out def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): - # TODO: fix - chroms: list[str] = matches.select("chr_name").unique().collect(projection_pushdown=False).get_column("chr_name").to_list() - for chrom in chroms: - # 1. filter by chromosome - chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) - # 2. split by effect type + if split: + chroms: list[str] = matches.select("chr_name").unique().collect(projection_pushdown=False).get_column("chr_name").to_list() + for chrom in chroms: + # 1. filter by chromosome + chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) + # 2. split by effect type + additive: pl.LazyFrame + dominant: pl.LazyFrame + recessive: pl.LazyFrame + additive, dominant, recessive = _split_effect_type(chrom_df) + + # 3. deduplicate + effect_types = ['additive', 'dominant', 'recessive'] + deduped = dict(zip(effect_types, [_deduplicate_variants(x) for x in [additive, dominant, recessive]])) + + # 4. pivot and write! + _write_split(deduped, chrom, dataset) + else: + # 1. split by effect type additive: pl.LazyFrame dominant: pl.LazyFrame recessive: pl.LazyFrame - additive, dominant, recessive = _split_effect_type(chrom_df) + additive, dominant, recessive = _split_effect_type(matches) - # 3. deduplicate + # 2. deduplicate effect_types = ['additive', 'dominant', 'recessive'] deduped = dict(zip(effect_types, [_deduplicate_variants(x) for x in [additive, dominant, recessive]])) - # 4. pivot and write! - _write_split(deduped, chrom, dataset) + # 3. pivot and write! 
+ _write_split(deduped, 'ALL', dataset) def _write_split(deduplicated: dict[str: tuple[int, pl.LazyFrame]], chrom: str, dataset: str): From 334edce88fcc8970df3f938d776c3589b82ef307 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 9 Nov 2022 16:59:38 +0000 Subject: [PATCH 29/54] update args --- pgscatalog_utils/match/combine_matches.py | 3 +- pgscatalog_utils/match/match_variants.py | 34 +++++++++++++++-------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 7e38807..c6cf78b 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -56,9 +56,10 @@ def _parse_args(args=None): help=' List of match files') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') + parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') - parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) + parser = add_label_args(parser) return parser.parse_args(args) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 680f524..3ac5ac1 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -213,6 +213,13 @@ def _parse_args(args=None): help=' Split scorefile per chromosome?') parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help=' Extra logging information') + parser = add_label_args(parser) + return _check_args(parser.parse_args(args)) + + +def add_label_args(parser): parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', help=''' Flag to force the program to keep 
variants with ambiguous alleles, (e.g. A/T and G/C SNPs), which are normally @@ -229,10 +236,7 @@ def _parse_args(args=None): parser.add_argument('--keep_first_match', dest='keep_first_match', action='store_true', help=''' If multiple match candidates for a variant exist that can't be prioritised, keep the first match candidate (default: drop all candidates)''') - parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', - help=' Extra logging information') - return _check_args(parser.parse_args(args)) - + return parser def _check_args(args): if args.chrom is not None and not args.only_match: @@ -257,17 +261,23 @@ def _check_args(args): # not writing scoring files, so split output doesn't make sense logger.critical("Invalid arguments: --only_match and --split (pick one!)") sys.exit(1) + label_error = False + if args.only_match and args.keep_first_match: + label_error = True + if args.only_match and args.ignore_strand_flips: + label_error = True + if args.only_match and args.keep_multiallelic: + label_error = True + if args.only_match and args.remove_ambiguous: + label_error = True + if label_error: + logger.critical("Invalid arguments: --only_match and --keep_first_match, --ignore_strand_flips," + "keep_multiallelic, or keep_ambiguous") + logger.critical("Pass these arguments to combine_matches instead") + sys.exit(1) return args -def _make_params_dict(args) -> dict[str, bool]: - """ Make a dictionary with parameters that control labelling match candidates """ - return {'keep_first_match': args.keep_first_match, - 'remove_ambiguous': args.remove_ambiguous, - 'skip_flip': args.skip_flip, - 'remove_multiallelic': args.remove_multiallelic} - - if __name__ == "__main__": match_variants() From a16c734ee9e44daf91995b4b2fe898708c0b673a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 9 Nov 2022 16:59:44 +0000 Subject: [PATCH 30/54] fix labelling --- pgscatalog_utils/match/combine_matches.py | 7 +++++- pgscatalog_utils/match/label.py | 21 
+++++++++++------- pgscatalog_utils/match/match_variants.py | 26 +++++++++++------------ 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index c6cf78b..7735f9b 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -4,7 +4,8 @@ import polars as pl from pgscatalog_utils import config -from pgscatalog_utils.match.match_variants import log_and_write +from pgscatalog_utils.match.label import make_params_dict, label_matches +from pgscatalog_utils.match.match_variants import log_and_write, add_label_args from pgscatalog_utils.match.read import read_scorefile logger = logging.getLogger(__name__) @@ -21,6 +22,10 @@ def combine_matches(): logger.debug("Reading matches") matches = pl.concat([pl.scan_ipc(x, memory_map=False, rechunk=False) for x in args.matches], rechunk=False) + logger.debug("Labelling match candidates") + params: dict[str, bool] = make_params_dict(args) + matches = matches.pipe(label_matches, params) + # make sure there's no duplicate variant_ids across matches in multiple pvars # processing batched chromosomes with overlapping variants might cause problems # e.g. chr1 1-100000, chr1 100001-500000 diff --git a/pgscatalog_utils/match/label.py b/pgscatalog_utils/match/label.py index 1c55ba3..d1c1417 100644 --- a/pgscatalog_utils/match/label.py +++ b/pgscatalog_utils/match/label.py @@ -7,6 +7,14 @@ logger = logging.getLogger(__name__) +def make_params_dict(args) -> dict[str, bool]: + """ Make a dictionary with parameters that control labelling match candidates """ + return {'keep_first_match': args.keep_first_match, + 'remove_ambiguous': args.remove_ambiguous, + 'skip_flip': args.skip_flip, + 'remove_multiallelic': args.remove_multiallelic} + + def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame: """ Label match candidates with additional metadata. 
Column definitions: @@ -92,17 +100,14 @@ def _label_duplicate_best_match(df: pl.LazyFrame) -> pl.LazyFrame: .otherwise(pl.lit(False)) .alias('duplicate_best_match')) .drop('count') - .rename({'row_nr': 'score_row_nr'}) - .with_row_count() # add temporary row count to get first variant + .with_row_count(name='temp_row_nr') # add temporary row count to get first variant .with_column(pl.when((pl.col("best_match") == True) & (pl.col("duplicate_best_match") == True) & - (pl.col("row_nr") > pl.min("row_nr")).over( - ["accession", "score_row_nr"])) + (pl.col("temp_row_nr") > pl.min("temp_row_nr")).over( + ["accession", "row_nr"])) .then(False) # reset best match flag for duplicates .otherwise(pl.col("best_match")) # just keep value from existing column - .alias('best_match_duplicate_row_nr')) - .drop(['row_nr', 'best_match']) - .rename({'score_row_nr': 'row_nr', 'best_match_duplicate_row_nr': 'best_match'})) + .alias('best_match'))) return labelled @@ -209,4 +214,4 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame: .alias('exclude')) else: logger.debug("Not excluding flipped matches") - return df \ No newline at end of file + return df diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 3ac5ac1..074eea1 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -9,7 +9,7 @@ import pgscatalog_utils.config as config from pgscatalog_utils.match import tempdir from pgscatalog_utils.match.filter import filter_scores -from pgscatalog_utils.match.label import label_matches +from pgscatalog_utils.match.label import label_matches, make_params_dict from pgscatalog_utils.match.log import make_logs, make_summary_log, check_log_count from pgscatalog_utils.match.match import get_all_matches from pgscatalog_utils.match.read import read_target, read_scorefile @@ -48,14 +48,14 @@ def match_variants(): case "single": logger.debug(f"Match mode: {match_mode}") # _fast_match 
with low_memory = True reads one target in chunks - matches: list[list[pl.LazyFrame]] = _fast_match(target_paths, scorefile, args, low_memory) + matches: list[list[pl.LazyFrame]] = _fast_match(target_paths, scorefile, low_memory) case "multi": logger.debug(f"Match mode: {match_mode}") # iterate over multiple targets, in chunks - matches: list[list[pl.LazyFrame]] = _match_multiple_targets(target_paths, scorefile, args, low_memory) + matches: list[list[pl.LazyFrame]] = _match_multiple_targets(target_paths, scorefile, low_memory) case "fast": logger.debug(f"Match mode: {match_mode}") # _fast_match with low_memory = False just read everything into memory for speed - matches: list[list[pl.LazyFrame]] = _fast_match(target_paths, scorefile, args, low_memory) + matches: list[list[pl.LazyFrame]] = _fast_match(target_paths, scorefile, low_memory) case _: logger.critical(f"Invalid match mode: {match_mode}") raise Exception @@ -69,6 +69,9 @@ def match_variants(): logger.debug("Intermediate files can be processed with combine_matches") sys.exit(0) else: + logger.debug("Labelling match candidates") + params: dict[str, bool] = make_params_dict(args) + matches = matches.pipe(label_matches, params) logger.debug("Filtering match candidates and making scoring files") log_and_write(matches=matches, scorefile=scorefile, dataset=dataset, args=args) @@ -100,7 +103,7 @@ def _materialise_matches(matches: list[list[pl.LazyFrame]], dataset: str, low_me # outer list: [target_1, target_2] # inner list: [ match_1, match_2 ] for i, match in enumerate(matches): - fout = tempdir.get_tmp_path("matches", f"match_{i}.ipc.zst") + fout = tempdir.get_tmp_path("matches", f"{dataset}_match_{i}.ipc.zst") if low_memory: pl.concat([x.collect() for x in match]).write_ipc(fout) else: @@ -119,28 +122,23 @@ def _check_target_chroms(target: pl.LazyFrame) -> None: logger.debug("Split target genome contains one chromosome (good)") -def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, - args: 
argparse.Namespace, low_memory: bool) -> list[list[pl.LazyFrame]]: +def _fast_match(target_paths: list[str], scorefile: pl.LazyFrame, low_memory: bool) -> list[list[pl.LazyFrame]]: # fast match is fast because: # 1) all target files are read into memory without batching # 2) matching occurs without iterating through chromosomes - params: dict[str, bool] = _make_params_dict(args) target: pl.LazyFrame = read_target(paths=target_paths, low_memory=low_memory) - matches = get_all_matches(scorefile=scorefile, target=target) - return [[x.pipe(label_matches, params=params) for x in matches]] + return [get_all_matches(scorefile=scorefile, target=target)] -def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, args: argparse.Namespace, +def _match_multiple_targets(target_paths: list[str], scorefile: pl.LazyFrame, low_memory: bool) -> list[list[pl.LazyFrame]]: match_lst = [] - params: dict[str, bool] = _make_params_dict(args) for i, loc_target_current in enumerate(target_paths): logger.debug(f'Matching scorefile(s) against target: {loc_target_current}') target: pl.LazyFrame = read_target(paths=[loc_target_current], low_memory=low_memory) if len(target_paths) > 1: _check_target_chroms(target) - matches: list[pl.LazyFrame] = get_all_matches(scorefile=scorefile, target=target) - match_lst.append([x.pipe(label_matches, params=params) for x in matches]) + match_lst.append(get_all_matches(scorefile=scorefile, target=target)) return match_lst From a606096c27085a2e70b3e99f039dfcdd763b7378 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 10 Nov 2022 13:00:32 +0000 Subject: [PATCH 31/54] fix tests --- pgscatalog_utils/match/match_variants.py | 1 + pgscatalog_utils/match/write.py | 22 +++++++++++++++------- tests/match/test_label.py | 12 ++++++------ tests/match/test_match.py | 12 ++++++------ 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 
074eea1..7354bc1 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -236,6 +236,7 @@ def add_label_args(parser): keep the first match candidate (default: drop all candidates)''') return parser + def _check_args(args): if args.chrom is not None and not args.only_match: # filtering the scoring file will break overlap assumptions and calculations diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 9492733..f4a97e0 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -24,15 +24,16 @@ def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], out def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): + _check_column_types(matches) + additive: pl.LazyFrame + dominant: pl.LazyFrame + recessive: pl.LazyFrame if split: - chroms: list[str] = matches.select("chr_name").unique().collect(projection_pushdown=False).get_column("chr_name").to_list() + chroms: list[str] = matches.select("chr_name").unique().collect().get_column("chr_name").to_list() for chrom in chroms: # 1. filter by chromosome chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) # 2. split by effect type - additive: pl.LazyFrame - dominant: pl.LazyFrame - recessive: pl.LazyFrame additive, dominant, recessive = _split_effect_type(chrom_df) # 3. deduplicate @@ -43,9 +44,6 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): _write_split(deduped, chrom, dataset) else: # 1. split by effect type - additive: pl.LazyFrame - dominant: pl.LazyFrame - recessive: pl.LazyFrame additive, dominant, recessive = _split_effect_type(matches) # 2. 
deduplicate @@ -56,6 +54,16 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): _write_split(deduped, 'ALL', dataset) +def _check_column_types(matches: pl.LazyFrame): + logger.debug("Checking column types") + # these columns are most important for writing out + correct_schema = {'chr_name': pl.Utf8, 'chr_position': pl.UInt64, 'ID': pl.Utf8, + 'matched_effect_allele': pl.Categorical, 'effect_weight': pl.Float64, + 'effect_type': pl.Categorical, 'accession': pl.Categorical} + col_types = {x: matches.schema.get(x) for x in list((matches.schema.keys() & correct_schema.keys()))} + assert col_types == correct_schema + + def _write_split(deduplicated: dict[str: tuple[int, pl.LazyFrame]], chrom: str, dataset: str): for effect_type, df_lst in deduplicated.items(): for i, et_df in df_lst: diff --git a/tests/match/test_label.py b/tests/match/test_label.py index e3e531f..ca9dada 100644 --- a/tests/match/test_label.py +++ b/tests/match/test_label.py @@ -31,7 +31,7 @@ def test_label(small_scorefile, small_target): # get_all_matches calls label_matches params = {'skip_flip': True, 'remove_ambiguous': True, 'remove_multiallelic': False, 'keep_first_match': False} - labelled: pl.DataFrame = (get_all_matches(scorefile=scorefile, target=target) + labelled: pl.DataFrame = (pl.concat(get_all_matches(scorefile=scorefile, target=target)) .pipe(label_matches, params=params) .collect()) @@ -47,7 +47,7 @@ def test_ambiguous_label(small_flipped_scorefile, small_target): """ Test ambiguous variant labels change when they're kept for match candidates with one match per position """ scorefile, target = _cast_cat(small_flipped_scorefile, small_target) no_flip = {'skip_flip': True, 'remove_ambiguous': True, 'remove_multiallelic': False, 'keep_first_match': False} - no_ambiguous: pl.DataFrame = (get_all_matches(scorefile=scorefile, target=target) + no_ambiguous: pl.DataFrame = (pl.concat(get_all_matches(scorefile=scorefile, target=target)) .pipe(label_matches, 
params=no_flip) .collect()) @@ -63,7 +63,7 @@ def test_ambiguous_label(small_flipped_scorefile, small_target): # otherwise, ambiguous variants are kept flip_params = {'skip_flip': True, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False} - labelled = (get_all_matches(scorefile=scorefile, target=target) + labelled = (pl.concat(get_all_matches(scorefile=scorefile, target=target)) .pipe(label_matches, params=flip_params) .collect()) @@ -125,7 +125,7 @@ def duplicated_matches(small_scorefile, small_target, request) -> pl.DataFrame: params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': request.param} - return (get_all_matches(scorefile=scorefile, target=target, low_memory=False) + return (pl.concat(get_all_matches(scorefile=scorefile, target=target)) .pipe(label_matches, params=params) .collect()) @@ -136,7 +136,7 @@ def multiple_match_types(small_target, small_scorefile) -> pl.DataFrame: scorefile, target = _cast_cat(small_scorefile, small_target) params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False} - return (get_all_matches(scorefile=scorefile, target=target, low_memory=False) + return (pl.concat(get_all_matches(scorefile=scorefile, target=target)) .pipe(label_matches, params=params) .filter(pl.col('chr_name') == '2') .collect()) @@ -151,6 +151,6 @@ def duplicate_best_match(small_target, small_scorefile_no_oa) -> pl.DataFrame: scorefile, target = _cast_cat(small_scorefile_no_oa, pl.DataFrame(odd_target)) params = {'skip_flip': False, 'remove_ambiguous': False, 'remove_multiallelic': False, 'keep_first_match': False} - return (get_all_matches(scorefile=scorefile, target=target, low_memory=False) + return (pl.concat(get_all_matches(scorefile=scorefile, target=target)) .pipe(label_matches, params=params) .collect()) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index b7ab82c..e9e3355 100644 --- 
a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -49,7 +49,7 @@ def test_match_strategies(small_scorefile, small_target): params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} # check unambiguous matches - df: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) + df: pl.DataFrame = (pl.concat(get_all_matches(scorefile, target)) .pipe(label_matches, params=params) .filter(pl.col('ambiguous') == False) .collect()) @@ -58,7 +58,7 @@ def test_match_strategies(small_scorefile, small_target): # when keeping ambiguous and flipping alleles flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - flip: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) + flip: pl.DataFrame = (pl.concat(get_all_matches(scorefile, target)) .pipe(label_matches, params=flip_params) .filter(pl.col('ambiguous') == True) .collect()) @@ -71,7 +71,7 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): scorefile, target = _cast_cat(small_scorefile_no_oa, small_target) no_ambig = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - df: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) + df: pl.DataFrame = (pl.concat(get_all_matches(scorefile, target)) .pipe(label_matches, params=no_ambig) .filter(pl.col('ambiguous') == False) .collect()) @@ -81,7 +81,7 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): # check ambiguous matches ambig = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - flip: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) + flip: pl.DataFrame = (pl.concat(get_all_matches(scorefile, target)) .pipe(label_matches, ambig) .filter(pl.col('ambiguous') == True) .collect()) @@ -92,7 +92,7 @@ def test_no_oa_match(small_scorefile_no_oa, small_target): def 
test_flip_match(small_flipped_scorefile, small_target): scorefile, target = _cast_cat(small_flipped_scorefile, small_target) params = {'skip_flip': True, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - df: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) + df: pl.DataFrame = (pl.concat(get_all_matches(scorefile, target)) .pipe(label_matches, params=params) .collect()) @@ -102,7 +102,7 @@ def test_flip_match(small_flipped_scorefile, small_target): no_flip_params = {'skip_flip': False, 'remove_ambiguous': False, 'keep_first_match': False, 'remove_multiallelic': False} - flip: pl.DataFrame = (get_all_matches(scorefile, target, low_memory=False) + flip: pl.DataFrame = (pl.concat(get_all_matches(scorefile, target)) .pipe(label_matches, params=no_flip_params) .filter(pl.col('ambiguous') == False) .collect()) From 2aded3470a8ae1cf51c586fe5a92500901e32e25 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 10 Nov 2022 13:08:28 +0000 Subject: [PATCH 32/54] fix schema type --- pgscatalog_utils/match/write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index f4a97e0..3e11554 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -57,7 +57,7 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): def _check_column_types(matches: pl.LazyFrame): logger.debug("Checking column types") # these columns are most important for writing out - correct_schema = {'chr_name': pl.Utf8, 'chr_position': pl.UInt64, 'ID': pl.Utf8, + correct_schema = {'chr_name': pl.Categorical, 'chr_position': pl.UInt64, 'ID': pl.Utf8, 'matched_effect_allele': pl.Categorical, 'effect_weight': pl.Float64, 'effect_type': pl.Categorical, 'accession': pl.Categorical} col_types = {x: matches.schema.get(x) for x in list((matches.schema.keys() & correct_schema.keys()))} From 2d23383577591357c2368f678ca448fee4dce802 
Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 10 Nov 2022 13:08:38 +0000 Subject: [PATCH 33/54] improve e2e tests --- tests/match/test_match.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/match/test_match.py b/tests/match/test_match.py index e9e3355..9c2647a 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -10,19 +10,35 @@ from pgscatalog_utils.match.match_variants import match_variants +def test_match_pass(mini_scorefile, target_path, tmp_path): + out_dir = str(tmp_path.resolve()) + + args: list[str] = ['match_variants', '-s', mini_scorefile, + '-t', target_path, + '-d', 'test', + '--min_overlap', '0.5', + '--outdir', out_dir, + '--keep_ambiguous', '--keep_multiallelic'] + + + with patch('sys.argv', args): + match_variants() + + def test_match_fail(mini_scorefile, target_path, tmp_path): out_dir = str(tmp_path.resolve()) args: list[str] = ['match_variants', '-s', mini_scorefile, '-t', target_path, '-d', 'test', - '--min_overlap', 1, + '--min_overlap', '1', '--outdir', out_dir, '--keep_ambiguous', '--keep_multiallelic'] - with pytest.raises(Exception): + with pytest.raises(Exception) as excinfo: with patch('sys.argv', args): match_variants() + assert "No valid matches found" in str(excinfo.value) def _cast_cat(scorefile, target) -> tuple[pl.LazyFrame, pl.LazyFrame]: From d52f0346f5e96a61ee67826c9048cffdee77b0c7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 10 Nov 2022 13:18:04 +0000 Subject: [PATCH 34/54] update gha actions-poetry --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bf0f138..cd3ddc8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,7 +14,7 @@ jobs: with: python-version: '3.10' - name: Python Poetry Action - uses: abatilo/actions-poetry@v2.1.3 + uses: abatilo/actions-poetry@v2.1.6 - name: Install run: poetry 
install - name: Test From 45e96505494729ea015faa8aba2f5aa74d9341c9 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 10 Nov 2022 13:43:22 +0000 Subject: [PATCH 35/54] add e2e tests for combine_matches --- pgscatalog_utils/match/match_variants.py | 11 ++--- tests/match/test_combine.py | 60 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 tests/match/test_combine.py diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 7354bc1..daca77b 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -67,7 +67,7 @@ def match_variants(): logger.debug(f"--only_match set, writing out match candidates {match_dir} and exiting") shutil.move(match_dir, args.outdir) logger.debug("Intermediate files can be processed with combine_matches") - sys.exit(0) + raise SystemExit(0) else: logger.debug("Labelling match candidates") params: dict[str, bool] = make_params_dict(args) @@ -263,17 +263,16 @@ def _check_args(args): label_error = False if args.only_match and args.keep_first_match: label_error = True - if args.only_match and args.ignore_strand_flips: + if args.only_match and args.skip_flip: label_error = True - if args.only_match and args.keep_multiallelic: + if args.only_match and args.remove_multiallelic: label_error = True if args.only_match and args.remove_ambiguous: label_error = True if label_error: - logger.critical("Invalid arguments: --only_match and --keep_first_match, --ignore_strand_flips," + logger.warning("Invalid arguments: --only_match and --keep_first_match, --ignore_strand_flips," "keep_multiallelic, or keep_ambiguous") - logger.critical("Pass these arguments to combine_matches instead") - sys.exit(1) + logger.warning("Pass these arguments to combine_matches instead") return args diff --git a/tests/match/test_combine.py b/tests/match/test_combine.py new file mode 100644 index 0000000..c63a112 --- /dev/null +++ 
b/tests/match/test_combine.py @@ -0,0 +1,60 @@ +import pytest +import os + +from unittest.mock import patch + +from pgscatalog_utils.match.combine_matches import combine_matches +from pgscatalog_utils.match.match_variants import match_variants + + +def test_combine_matches_pass(mini_scorefile, only_matches, tmp_path): + out_dir = str(tmp_path.resolve()) + + args: list[str] = ['combine_matches', '-s', mini_scorefile, + '-m', only_matches, + '-d', 'test', + '--outdir', out_dir, + '--min_overlap', '0.9', + '--ignore_strand_flips', + '--keep_first_match', + '--keep_multiallelic'] + + with patch('sys.argv', args): + combine_matches() + + +def test_combine_matches_fail(mini_scorefile, only_matches, tmp_path): + out_dir = str(tmp_path.resolve()) + + args: list[str] = ['combine_matches', '-s', mini_scorefile, + '-m', only_matches, + '-d', 'test', + '--outdir', out_dir, + '--min_overlap', '1.0', + '--ignore_strand_flips', + '--keep_first_match', + '--keep_multiallelic'] + + with pytest.raises(Exception) as excinfo: + with patch('sys.argv', args): + combine_matches() + + assert "No valid matches found" in str(excinfo.value) + + +@pytest.fixture +def only_matches(mini_scorefile, target_path, tmp_path): + out_dir = str(tmp_path.resolve()) + + args: list[str] = ['match_variants', '-s', mini_scorefile, + '-t', target_path, + '-d', 'test', + '--outdir', out_dir, + '--only_match'] + + with pytest.raises(SystemExit, match='0'): + with patch('sys.argv', args): + match_variants() + + return os.path.join(out_dir, 'matches', 'test_match_0.ipc.zst') + From 1aa3512f6fd58aba73a366612f7b83dc22cd8f8d Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 10 Nov 2022 14:27:48 +0000 Subject: [PATCH 36/54] Make argument order and parsing more clear --- pgscatalog_utils/match/combine_matches.py | 12 ++++++------ pgscatalog_utils/match/match_variants.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pgscatalog_utils/match/combine_matches.py 
b/pgscatalog_utils/match/combine_matches.py index 7735f9b..ad4fbdc 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -5,7 +5,7 @@ from pgscatalog_utils import config from pgscatalog_utils.match.label import make_params_dict, label_matches -from pgscatalog_utils.match.match_variants import log_and_write, add_label_args +from pgscatalog_utils.match.match_variants import log_and_write, add_match_args from pgscatalog_utils.match.read import read_scorefile logger = logging.getLogger(__name__) @@ -51,20 +51,20 @@ def _parse_args(args=None): parser = argparse.ArgumentParser() parser.add_argument('-d', '--dataset', dest='dataset', required=True, help=' Label for target genomic dataset') - parser.add_argument('--min_overlap', dest='min_overlap', required=True, - type=float, help=' Minimum proportion of variants to match before error') parser.add_argument('-s', '--scorefile', dest='scorefile', required=True, help=' Path to scorefile') - parser.add_argument('--split', dest='split', default=False, action='store_true', - help=' Split scorefile per chromosome?') parser.add_argument('-m', '--matches', dest='matches', required=True, nargs='+', help=' List of match files') + parser.add_argument('--min_overlap', dest='min_overlap', required=True, + type=float, help=' Minimum proportion of variants to match before error') + parser = add_match_args(parser) # params for labelling matches parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') + parser.add_argument('--split', dest='split', default=False, action='store_true', + help=' Split scorefile per chromosome?') parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') - parser = add_label_args(parser) return parser.parse_args(args) diff --git a/pgscatalog_utils/match/match_variants.py 
b/pgscatalog_utils/match/match_variants.py index daca77b..5aa156e 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -206,18 +206,18 @@ def _parse_args(args=None): help=" Only match, then write intermediate files, don't make scoring files") parser.add_argument('--min_overlap', dest='min_overlap', required=False, type=float, help=' Minimum proportion of variants to match before error') - parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) - parser.add_argument('--split', dest='split', default=False, action='store_true', - help=' Split scorefile per chromosome?') + parser = add_match_args(parser) # params for labelling matches parser.add_argument('--outdir', dest='outdir', required=True, help=' Output directory') + parser.add_argument('--split', dest='split', default=False, action='store_true', + help=' Split scorefile per chromosome?') + parser.add_argument('-n', dest='n_threads', default=1, help=' n threads for matching', type=int) parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') - parser = add_label_args(parser) return _check_args(parser.parse_args(args)) -def add_label_args(parser): +def add_match_args(parser): parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false', help=''' Flag to force the program to keep variants with ambiguous alleles, (e.g. 
A/T and G/C SNPs), which are normally @@ -261,13 +261,13 @@ def _check_args(args): logger.critical("Invalid arguments: --only_match and --split (pick one!)") sys.exit(1) label_error = False - if args.only_match and args.keep_first_match: + if args.only_match and ('--keep_first_match' in sys.argv): label_error = True - if args.only_match and args.skip_flip: + if args.only_match and ('--ignore_strand_flips' in sys.argv): label_error = True - if args.only_match and args.remove_multiallelic: + if args.only_match and ('--keep_multiallelic' in sys.argv): label_error = True - if args.only_match and args.remove_ambiguous: + if args.only_match and ('--keep_ambiguous' in sys.argv): label_error = True if label_error: logger.warning("Invalid arguments: --only_match and --keep_first_match, --ignore_strand_flips," From c01aafcdf2ab88ad81e067765cf1da107b6acd01 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 10 Nov 2022 14:37:31 +0000 Subject: [PATCH 37/54] Simplify label error parsing --- pgscatalog_utils/match/match_variants.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 5aa156e..dec6b7f 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -260,16 +260,8 @@ def _check_args(args): # not writing scoring files, so split output doesn't make sense logger.critical("Invalid arguments: --only_match and --split (pick one!)") sys.exit(1) - label_error = False - if args.only_match and ('--keep_first_match' in sys.argv): - label_error = True - if args.only_match and ('--ignore_strand_flips' in sys.argv): - label_error = True - if args.only_match and ('--keep_multiallelic' in sys.argv): - label_error = True - if args.only_match and ('--keep_ambiguous' in sys.argv): - label_error = True - if label_error: + if any([x in sys.argv for x in ['--keep_first_match', '--ignore_strand_flips', + '--keep_multiallelic', 
'--keep_ambiguous']]): logger.warning("Invalid arguments: --only_match and --keep_first_match, --ignore_strand_flips," "keep_multiallelic, or keep_ambiguous") logger.warning("Pass these arguments to combine_matches instead") From 182715f7247948b727eb111b6b0110f760577fe1 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Thu, 10 Nov 2022 16:29:31 +0000 Subject: [PATCH 38/54] Fix output and naming convention for logfile --- pgscatalog_utils/scorefile/combine_scorefiles.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 3a4c9ed..2c8421c 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -16,7 +16,6 @@ from pgscatalog_utils.scorefile.write import write_scorefile -json_logs_filename = 'combined_log.json' headers2logs = [ 'pgs_id', 'pgs_name', @@ -48,7 +47,12 @@ def combine_scorefiles(): # Score header logs - init score_logs = {} - json_logs_file = os.path.dirname(args.outfile)+'/'+json_logs_filename + dir_output = os.path.dirname(args.outfile) + if dir_output == '': + dir_output = './' + elif dir_output.endswith('/') is False: + dir_output += '/' + json_logs_file = dir_output + args.logfile for x in paths: # Read scorefile df and header @@ -175,6 +179,9 @@ def _parse_args(args=None) -> argparse.Namespace: default='combined.txt', help=' Output path to combined long scorefile ' '[ will compress output if filename ends with .gz ]') + parser.add_argument('-l', '--logfile', dest='logfile', default='log_combined.json', + help=' Name for the log file (score metadata) for combined scores.' 
+ '[ will write to identical directory as combined scorefile]') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help=' Extra logging information') return parser.parse_args(args) From 0fabd9c397126c704d9726696f3fb339ebcd1d5c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Fri, 11 Nov 2022 10:24:24 +0000 Subject: [PATCH 39/54] fix scorefile output + add test to check format --- pgscatalog_utils/match/write.py | 3 ++- tests/match/test_combine.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 3e11554..6974740 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -96,7 +96,7 @@ def _write_text_pgzip(df: pl.LazyFrame, fout: str, append: bool = False): mode = 'wb' with pgzip.open(fout, mode, thread=config.N_THREADS) as f: - df.collect().write_csv(f) + df.collect().write_csv(f, sep='\t') def _pivot_score(df: pl.LazyFrame, chrom: str) -> pl.LazyFrame: @@ -112,6 +112,7 @@ def _pivot_score(df: pl.LazyFrame, chrom: str) -> pl.LazyFrame: columns="accession") .rename({"matched_effect_allele": "effect_allele"}) .fill_null(strategy="zero") + .drop("effect_type") .lazy()) diff --git a/tests/match/test_combine.py b/tests/match/test_combine.py index c63a112..4a17566 100644 --- a/tests/match/test_combine.py +++ b/tests/match/test_combine.py @@ -1,5 +1,8 @@ +import gzip + import pytest import os +import polars as pl from unittest.mock import patch @@ -22,6 +25,13 @@ def test_combine_matches_pass(mini_scorefile, only_matches, tmp_path): with patch('sys.argv', args): combine_matches() + # and double check the output format of scorefiles + with gzip.open(os.path.join(tmp_path, 'test_ALL_additive_0.scorefile.gz')) as f: + scores = pl.read_csv(f, sep='\t') + # pl.Categorical vs pl.Utf8 doesn't matter for this test + assert scores.schema == {'ID': pl.Utf8, 'effect_allele': pl.Utf8, 'PGS001229_22': pl.Float64} + + def 
test_combine_matches_fail(mini_scorefile, only_matches, tmp_path): out_dir = str(tmp_path.resolve()) From 8b7f6964d3d5b65eec0e3dd6af6922a7c3f7571a Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 11 Nov 2022 11:25:49 +0000 Subject: [PATCH 40/54] Log: extract pgp_id and fix ',' in reported trait bug. TODO: fix when |-bar demimited headers reappear --- pgscatalog_utils/scorefile/combine_scorefiles.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 2c8421c..3e998c6 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -18,11 +18,14 @@ headers2logs = [ 'pgs_id', + 'pgp_id', 'pgs_name', 'genome_build', 'variants_number', + 'trait_reported', 'trait_efo', 'trait_mapped', + 'weight_type', 'citation' ] headers2logs_harmonisation = [ @@ -114,7 +117,8 @@ def combine_scorefiles(): for header in headers2logs: header_val = h.get(header) if header.startswith('trait'): - header_val = header_val.split(',') + if (header == 'trait_mapped') and (len(h.get('trait_efo').split(',')) > 1): + header_val = header_val.split(',') score_header[header] = header_val # Other header information score_header['columns'] = list(score.columns) @@ -134,7 +138,7 @@ def combine_scorefiles(): # Write Score header logs file with open(json_logs_file, 'w') as fp: - json.dump(score_logs, fp) + json.dump(score_logs, fp, indent=4) def _description_text() -> str: From 6edcc11d304f4268af235d70bd316c3ffae661d9 Mon Sep 17 00:00:00 2001 From: Laurent Gil Date: Fri, 11 Nov 2022 11:34:59 +0000 Subject: [PATCH 41/54] Fix to separate traits --- pgscatalog_utils/scorefile/combine_scorefiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 3e998c6..982acf5 100644 --- 
a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -117,8 +117,8 @@ def combine_scorefiles(): for header in headers2logs: header_val = h.get(header) if header.startswith('trait'): - if (header == 'trait_mapped') and (len(h.get('trait_efo').split(',')) > 1): - header_val = header_val.split(',') + if (header == 'trait_mapped') and (len(h.get('trait_efo').split('|')) > 1): + header_val = header_val.split('|') score_header[header] = header_val # Other header information score_header['columns'] = list(score.columns) From 67f0f6751cdea29c70363c6e3ed1706a1a160b5f Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 11 Nov 2022 11:54:08 +0000 Subject: [PATCH 42/54] Use correct delimited for mapped/efo traits --- pgscatalog_utils/scorefile/combine_scorefiles.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 982acf5..c370ea4 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -116,9 +116,8 @@ def combine_scorefiles(): # Scoring file header information for header in headers2logs: header_val = h.get(header) - if header.startswith('trait'): - if (header == 'trait_mapped') and (len(h.get('trait_efo').split('|')) > 1): - header_val = header_val.split('|') + if header in ['trait_efo', 'trait_mapped']: + header_val = header_val.split('|') score_header[header] = header_val # Other header information score_header['columns'] = list(score.columns) From 774af6f09dcc2b0fb3290ce059d302b37282f402 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 11 Nov 2022 14:43:05 +0000 Subject: [PATCH 43/54] Add variant number to log for custom scoring files --- pgscatalog_utils/scorefile/combine_scorefiles.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py 
b/pgscatalog_utils/scorefile/combine_scorefiles.py index c370ea4..9465484 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -60,6 +60,7 @@ def combine_scorefiles(): for x in paths: # Read scorefile df and header h, score = load_scorefile(x) + score_shape_original = score.shape if score.empty: logger.critical(f"Empty scorefile {x} detected! Please check the input data") @@ -116,7 +117,7 @@ def combine_scorefiles(): # Scoring file header information for header in headers2logs: header_val = h.get(header) - if header in ['trait_efo', 'trait_mapped']: + if (header in ['trait_efo', 'trait_mapped']) and (header_val is not None): header_val = header_val.split('|') score_header[header] = header_val # Other header information @@ -134,6 +135,8 @@ def combine_scorefiles(): if hm_header.startswith('HmPOS_match'): hm_header_val = json.loads(hm_header_val) score_header[hm_header] = hm_header_val + if score_header['variants_number'] is None: + score_header['variants_number'] = score_shape_original[0] # Write Score header logs file with open(json_logs_file, 'w') as fp: From 6174e85302aab220a478a317689aa7af52ebd57a Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 14 Nov 2022 12:56:27 +0000 Subject: [PATCH 44/54] check if --outdir exists --- pgscatalog_utils/config.py | 6 ++++++ pgscatalog_utils/match/combine_matches.py | 1 + pgscatalog_utils/match/match_variants.py | 1 + 3 files changed, 8 insertions(+) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 1cff092..41573fd 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -14,6 +14,12 @@ logger = logging.getLogger(__name__) +def check_outdir(outdir): + if os.path.exists(outdir): + logger.critical("--outdir already exists, bailing out") + raise SystemExit(1) + + def setup_cleaning(): logger.debug(F"Temporary directory set up: {TEMPDIR.name}") atexit.register(tempdir.cleanup) diff --git 
a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index ad4fbdc..5b1c343 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -15,6 +15,7 @@ def combine_matches(): args = _parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) + config.check_outdir(args.outdir) config.OUTDIR = args.outdir with pl.StringCache(): diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index dec6b7f..621ae16 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -22,6 +22,7 @@ def match_variants(): args = _parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) + config.check_outdir(args.outdir) config.setup_cleaning() with pl.StringCache(): From 8d101f92026dc39dc197eda96717beba0e3cc1c5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 14 Nov 2022 13:36:59 +0000 Subject: [PATCH 45/54] just check if <--outdir>/matches exists -_- --- pgscatalog_utils/config.py | 5 +++-- pgscatalog_utils/match/combine_matches.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 41573fd..c233acd 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -15,8 +15,9 @@ def check_outdir(outdir): - if os.path.exists(outdir): - logger.critical("--outdir already exists, bailing out") + if os.path.exists(os.path.join(outdir, "matches")): + logger.critical("--outdir/matches already exists, bailing out") + logger.critical("Please choose a different --outdir") raise SystemExit(1) diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 5b1c343..ad4fbdc 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -15,7 +15,6 @@ def combine_matches(): args = 
_parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) - config.check_outdir(args.outdir) config.OUTDIR = args.outdir with pl.StringCache(): From ceb27ecdac85faa5d3e63e68513aaaf5887cd7d5 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 15 Nov 2022 16:21:50 +0000 Subject: [PATCH 46/54] move tempdir --- pgscatalog_utils/config.py | 16 +++++++++++----- pgscatalog_utils/match/match_variants.py | 4 ++-- pgscatalog_utils/match/read.py | 6 +++--- pgscatalog_utils/target.py | 24 ++++++++++++------------ 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index c233acd..de6cd1b 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -9,16 +9,22 @@ N_THREADS: int = 1 # dummy value, is reset by args.n_threads (default: 1) OUTDIR: str = "." # dummy value, reset by args.outdir -TEMPDIR: tempfile.TemporaryDirectory = tempfile.TemporaryDirectory() +TEMPDIR: tempfile.TemporaryDirectory logger = logging.getLogger(__name__) def check_outdir(outdir): - if os.path.exists(os.path.join(outdir, "matches")): - logger.critical("--outdir/matches already exists, bailing out") - logger.critical("Please choose a different --outdir") - raise SystemExit(1) + for i in ['matches', 'work']: + d: str = os.path.join(outdir, i) + if os.path.exists(d): + logger.critical(f"{d} already exists, bailing out") + logger.critical("Please choose a different --outdir") + raise SystemExit(1) + + global TEMPDIR + os.mkdir(os.path.join(outdir, "work")) + TEMPDIR = tempfile.TemporaryDirectory(dir=os.path.join(outdir, "work")) def setup_cleaning(): diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 621ae16..15df6f2 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -106,9 +106,9 @@ def _materialise_matches(matches: list[list[pl.LazyFrame]], dataset: str, low_me for i, match 
in enumerate(matches): fout = tempdir.get_tmp_path("matches", f"{dataset}_match_{i}.ipc.zst") if low_memory: - pl.concat([x.collect() for x in match]).write_ipc(fout) + pl.concat([x.collect() for x in match]).write_ipc(fout, compression='zstd') else: - pl.concat(pl.collect_all(match)).write_ipc(fout) + pl.concat(pl.collect_all(match)).write_ipc(fout, compression='zstd') match_dir: str = tempdir.get_tmp_path("matches", "") ldf: pl.LazyFrame = pl.scan_ipc(match_dir + "*.ipc.zst", memory_map=False) return match_dir, ldf diff --git a/pgscatalog_utils/match/read.py b/pgscatalog_utils/match/read.py index a77f037..43ed39b 100644 --- a/pgscatalog_utils/match/read.py +++ b/pgscatalog_utils/match/read.py @@ -33,9 +33,9 @@ def read_scorefile(path: str, chrom: typing.Union[str, None]) -> pl.LazyFrame: # parse CSV and write to temporary feather file # enforce laziness! scanning is very fast and saves memory - fout: str = get_tmp_path("scorefile", "scorefile.ipc") - (pl.read_csv(path, sep='\t', dtype=dtypes).write_ipc(fout)) - ldf: pl.LazyFrame = pl.scan_ipc(fout) + fout: str = get_tmp_path("scorefile", "scorefile.ipc.zst") + (pl.read_csv(path, sep='\t', dtype=dtypes).write_ipc(fout, compression='zstd')) + ldf: pl.LazyFrame = pl.scan_ipc(fout, memory_map=False) if chrom is not None: logger.debug(f"--chrom set, filtering scoring file to chromosome {chrom}") diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index ab52971..4f9faa3 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -66,7 +66,7 @@ def _read_compressed(self) -> pl.LazyFrame: dtypes = _get_col_dtypes(self.file_format) col_idxs, new_col_names = _default_cols(self.file_format) - fn: str = pathlib.Path(self.path).stem + ".ipc" + fn: str = pathlib.Path(self.path).stem + ".ipc.zst" fout = get_tmp_path("input", fn) (pl.read_csv(reader, sep='\t', has_header=False, comment_char='#', @@ -74,15 +74,15 @@ def _read_compressed(self) -> pl.LazyFrame: columns=col_idxs, 
new_columns=new_col_names, n_threads=config.N_THREADS) - .write_ipc(fout)) - return pl.scan_ipc(fout) + .write_ipc(fout, compression='zstd')) + return pl.scan_ipc(fout, memory_map=False) def _read_uncompressed(self) -> pl.LazyFrame: """ Read an uncompressed target as quickly as possible. Uses up to 16GB RAM on 1000 genomes pvar. """ dtypes = _get_col_dtypes(self.file_format) col_idxs, new_col_names = _default_cols(self.file_format) - fn: str = pathlib.Path(self.path).stem + ".ipc" + fn: str = pathlib.Path(self.path).stem + ".ipc.zst" fout: str = get_tmp_path("input", fn) (pl.read_csv(self.path, sep='\t', has_header=False, comment_char='#', @@ -90,8 +90,8 @@ def _read_uncompressed(self) -> pl.LazyFrame: columns=col_idxs, new_columns=new_col_names, n_threads=config.N_THREADS) - .write_ipc(fout)) - return pl.scan_ipc(fout) + .write_ipc(fout, compression='zstd')) + return pl.scan_ipc(fout, memory_map=False) def _read_uncompressed_chunks(self) -> pl.LazyFrame: """ Read a CSV using a BufferedReader in batches to reduce memory usage. 
@@ -113,7 +113,7 @@ def _read_uncompressed_chunks(self) -> pl.LazyFrame: if not line_batch: break - fn: str = str(batch_n) + ".ipc" + fn: str = str(batch_n) + ".ipc.zst" fout: str = get_tmp_path("input", fn) (pl.read_csv(line_batch, sep='\t', has_header=False, comment_char='#', @@ -121,12 +121,12 @@ def _read_uncompressed_chunks(self) -> pl.LazyFrame: columns=col_idxs, new_columns=new_col_names, n_threads=config.N_THREADS) - .write_ipc(fout)) + .write_ipc(fout, compression='zstd')) batch_n += 1 gc.collect() # just to be safe logger.debug(f"{batch_n} batches staged in temporary directory {config.TEMPDIR}") - return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc")) + return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc.zst"), memory_map=False) def _read_compressed_chunks(self) -> pl.LazyFrame: """ Like _read_uncompressed_chunks, but read chunks of bytes and handle incomplete rows @@ -154,7 +154,7 @@ def _read_compressed_chunks(self) -> pl.LazyFrame: else: row_chunk = chunk[:end] - fn: str = str(n_chunks) + ".ipc" + fn: str = str(n_chunks) + ".ipc.zst" fout: str = get_tmp_path("input", fn) (pl.read_csv(row_chunk, sep='\t', has_header=False, comment_char='#', @@ -162,14 +162,14 @@ def _read_compressed_chunks(self) -> pl.LazyFrame: columns=columns, new_columns=new_col_names, n_threads=config.N_THREADS) - .write_ipc(fout)) + .write_ipc(fout, compression='zstd')) chunk_buffer = b''.join([chunk_buffer, chunk[end:]]) n_chunks += 1 gc.collect() # just to be safe logger.debug(f"{n_chunks} chunks") # write_size will change n_chunks - return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc")) + return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc.zst")) def _get_col_dtypes(file_format): From a93a32f0edbcb189995e95fc162e23c1ba29c0c8 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 15 Nov 2022 16:40:19 +0000 Subject: [PATCH 47/54] stop chatty logs re: mmap --- pgscatalog_utils/config.py | 4 ++-- 
pgscatalog_utils/target.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index de6cd1b..ad4c3ca 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -16,10 +16,10 @@ def check_outdir(outdir): for i in ['matches', 'work']: - d: str = os.path.join(outdir, i) + d: str = os.path.abspath(os.path.join(outdir, i)) if os.path.exists(d): logger.critical(f"{d} already exists, bailing out") - logger.critical("Please choose a different --outdir") + logger.critical("Please choose a different --outdir or clean up") raise SystemExit(1) global TEMPDIR diff --git a/pgscatalog_utils/target.py b/pgscatalog_utils/target.py index 4f9faa3..c318d56 100644 --- a/pgscatalog_utils/target.py +++ b/pgscatalog_utils/target.py @@ -169,7 +169,7 @@ def _read_compressed_chunks(self) -> pl.LazyFrame: gc.collect() # just to be safe logger.debug(f"{n_chunks} chunks") # write_size will change n_chunks - return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc.zst")) + return pl.scan_ipc(os.path.join(config.TEMPDIR.name, "input", "*.ipc.zst"), memory_map=False) def _get_col_dtypes(file_format): From 785684f4b282ee488962161b58a8b84e96d5d9c7 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 16 Nov 2022 10:39:17 +0000 Subject: [PATCH 48/54] fix tempdir with combine_matches --- pgscatalog_utils/config.py | 2 +- pgscatalog_utils/match/combine_matches.py | 1 + pgscatalog_utils/match/match_variants.py | 2 +- pgscatalog_utils/match/tempdir.py | 3 +++ tests/match/test_combine.py | 1 - 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index ad4c3ca..8dd1e65 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -def check_outdir(outdir): +def setup_outdir(outdir): for i in ['matches', 'work']: d: str = os.path.abspath(os.path.join(outdir, i)) if 
os.path.exists(d): diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index ad4fbdc..6e771c4 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -15,6 +15,7 @@ def combine_matches(): args = _parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) + config.setup_outdir(args.outdir) config.OUTDIR = args.outdir with pl.StringCache(): diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index 15df6f2..c2c315f 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -22,7 +22,7 @@ def match_variants(): args = _parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) - config.check_outdir(args.outdir) + config.setup_outdir(args.outdir) config.setup_cleaning() with pl.StringCache(): diff --git a/pgscatalog_utils/match/tempdir.py b/pgscatalog_utils/match/tempdir.py index e937b99..f03e5b8 100644 --- a/pgscatalog_utils/match/tempdir.py +++ b/pgscatalog_utils/match/tempdir.py @@ -46,3 +46,6 @@ def cleanup(): """ logger.debug(f"Cleaning up tempdir path {config.TEMPDIR}") config.TEMPDIR.cleanup() + workdir = os.path.dirname(config.TEMPDIR.name) + logger.debug(f"Cleaning up work {workdir}") + os.remove(workdir) diff --git a/tests/match/test_combine.py b/tests/match/test_combine.py index 4a17566..411a098 100644 --- a/tests/match/test_combine.py +++ b/tests/match/test_combine.py @@ -32,7 +32,6 @@ def test_combine_matches_pass(mini_scorefile, only_matches, tmp_path): assert scores.schema == {'ID': pl.Utf8, 'effect_allele': pl.Utf8, 'PGS001229_22': pl.Float64} - def test_combine_matches_fail(mini_scorefile, only_matches, tmp_path): out_dir = str(tmp_path.resolve()) From eeb0681a84ffd0f86468d883b23361911547a55c Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 16 Nov 2022 12:09:45 +0000 Subject: [PATCH 
49/54] fix workdir --- pgscatalog_utils/config.py | 16 +++++++++++----- pgscatalog_utils/match/combine_matches.py | 2 +- pgscatalog_utils/match/tempdir.py | 3 --- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pgscatalog_utils/config.py b/pgscatalog_utils/config.py index 8dd1e65..b5f842a 100644 --- a/pgscatalog_utils/config.py +++ b/pgscatalog_utils/config.py @@ -14,17 +14,23 @@ logger = logging.getLogger(__name__) -def setup_outdir(outdir): - for i in ['matches', 'work']: - d: str = os.path.abspath(os.path.join(outdir, i)) +def setup_tmpdir(outdir, combine=False): + if combine: + work_dir = "work_combine" + dirs = [work_dir] + else: + work_dir = "work_match" + dirs = [work_dir, "matches"] + + for d in dirs: if os.path.exists(d): logger.critical(f"{d} already exists, bailing out") logger.critical("Please choose a different --outdir or clean up") raise SystemExit(1) global TEMPDIR - os.mkdir(os.path.join(outdir, "work")) - TEMPDIR = tempfile.TemporaryDirectory(dir=os.path.join(outdir, "work")) + os.mkdir(os.path.join(outdir, work_dir)) + TEMPDIR = tempfile.TemporaryDirectory(dir=os.path.join(outdir, work_dir)) def setup_cleaning(): diff --git a/pgscatalog_utils/match/combine_matches.py b/pgscatalog_utils/match/combine_matches.py index 6e771c4..0e4ea17 100644 --- a/pgscatalog_utils/match/combine_matches.py +++ b/pgscatalog_utils/match/combine_matches.py @@ -15,7 +15,7 @@ def combine_matches(): args = _parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) - config.setup_outdir(args.outdir) + config.setup_tmpdir(args.outdir, combine=True) config.OUTDIR = args.outdir with pl.StringCache(): diff --git a/pgscatalog_utils/match/tempdir.py b/pgscatalog_utils/match/tempdir.py index f03e5b8..e937b99 100644 --- a/pgscatalog_utils/match/tempdir.py +++ b/pgscatalog_utils/match/tempdir.py @@ -46,6 +46,3 @@ def cleanup(): """ logger.debug(f"Cleaning up tempdir path {config.TEMPDIR}") config.TEMPDIR.cleanup() - workdir = 
os.path.dirname(config.TEMPDIR.name) - logger.debug(f"Cleaning up work {workdir}") - os.remove(workdir) From a8481a3f5e9881d7939c27a0ee9779516ee46557 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Wed, 16 Nov 2022 12:09:50 +0000 Subject: [PATCH 50/54] fix match_variants not respecting --outdir --- pgscatalog_utils/match/match_variants.py | 7 ++++-- pgscatalog_utils/match/write.py | 4 +++- tests/match/test_combine.py | 2 ++ tests/match/test_match.py | 29 +++++++++++++++++++++--- 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py index c2c315f..8e0b446 100644 --- a/pgscatalog_utils/match/match_variants.py +++ b/pgscatalog_utils/match/match_variants.py @@ -1,5 +1,6 @@ import argparse import logging +import os import shutil import sys import textwrap @@ -22,8 +23,9 @@ def match_variants(): args = _parse_args() config.set_logging_level(args.verbose) config.setup_polars_threads(args.n_threads) - config.setup_outdir(args.outdir) + config.setup_tmpdir(args.outdir) config.setup_cleaning() + config.OUTDIR = args.outdir with pl.StringCache(): scorefile: pl.LazyFrame = read_scorefile(path=args.scorefile, chrom=args.chrom) @@ -96,7 +98,8 @@ def log_and_write(matches: pl.LazyFrame, scorefile: pl.LazyFrame, dataset: str, check_log_count(summary_log=summary_log, scorefile=scorefile) write_log(df=big_log, prefix=dataset, chrom=None, outdir=args.outdir) - summary_log.collect().write_csv(f"{dataset}_summary.csv") + dout = os.path.abspath(config.OUTDIR) + summary_log.collect().write_csv(os.path.join(dout, f"{dataset}_summary.csv")) def _materialise_matches(matches: list[list[pl.LazyFrame]], dataset: str, low_memory: bool) -> tuple[str, pl.LazyFrame]: diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 6974740..5c47b99 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -73,7 +73,9 @@ def _write_split(deduplicated: 
dict[str: tuple[int, pl.LazyFrame]], chrom: str, # pivoting is !! _expensive_ !! (it collects the lazyframe) pivoted: pl.LazyFrame = _pivot_score(et_df, chrom) - fout = os.path.join(config.OUTDIR, f"{dataset}_{chrom}_{effect_type}_{i}.scorefile.gz") + + dout = os.path.abspath(config.OUTDIR) + fout = os.path.join(dout, f"{dataset}_{chrom}_{effect_type}_{i}.scorefile.gz") _write_text_pgzip(pivoted, fout) diff --git a/tests/match/test_combine.py b/tests/match/test_combine.py index 411a098..8ada4b6 100644 --- a/tests/match/test_combine.py +++ b/tests/match/test_combine.py @@ -24,6 +24,8 @@ def test_combine_matches_pass(mini_scorefile, only_matches, tmp_path): with patch('sys.argv', args): combine_matches() + assert os.path.exists(os.path.join(out_dir, "test_ALL_additive_0.scorefile.gz")) + assert os.path.exists(os.path.join(out_dir, "test_log.csv.gz")) # and double check the output format of scorefiles with gzip.open(os.path.join(tmp_path, 'test_ALL_additive_0.scorefile.gz')) as f: diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 9c2647a..4ade32e 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -1,5 +1,5 @@ """ Test that match strategies return the expected match results""" - +import os from unittest.mock import patch import polars as pl @@ -10,20 +10,43 @@ from pgscatalog_utils.match.match_variants import match_variants +def test_only_match_pass(mini_scorefile, target_path, tmp_path): + out_dir = str(tmp_path.resolve()) + + args: list[str] = ['match_variants', '-s', mini_scorefile, + '-t', target_path, + '-d', 'test', + # '--min_overlap', '0.5', + '--only_match', + '--outdir', out_dir] + # '--keep_ambiguous', '--keep_multiallelic'] + + + with patch('sys.argv', args): + with pytest.raises(SystemExit) as se: + match_variants() + assert se.value.code == 0 + + assert os.path.exists(os.path.join(out_dir, "matches/test_match_0.ipc.zst")) + + def test_match_pass(mini_scorefile, target_path, tmp_path): out_dir = 
str(tmp_path.resolve()) args: list[str] = ['match_variants', '-s', mini_scorefile, '-t', target_path, '-d', 'test', - '--min_overlap', '0.5', + '--min_overlap', '0.95', '--outdir', out_dir, '--keep_ambiguous', '--keep_multiallelic'] - with patch('sys.argv', args): match_variants() + assert os.path.exists(os.path.join(out_dir, "test_summary.csv")) + assert os.path.exists(os.path.join(out_dir, "test_log.csv.gz")) + assert os.path.exists(os.path.join(out_dir, "test_ALL_additive_0.scorefile.gz")) + def test_match_fail(mini_scorefile, target_path, tmp_path): out_dir = str(tmp_path.resolve()) From 0b1de5aa0f004a09265c41b5d7a98ce8df0f2413 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Thu, 17 Nov 2022 09:46:48 +0000 Subject: [PATCH 51/54] be slightly less lazy --- pgscatalog_utils/match/write.py | 4 ++++ tests/match/test_match.py | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 5c47b99..409d39c 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -28,6 +28,10 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): additive: pl.LazyFrame dominant: pl.LazyFrame recessive: pl.LazyFrame + + # collect to cache! 
+ matches: pl.LazyFrame = matches.collect().lazy() + if split: chroms: list[str] = matches.select("chr_name").unique().collect().get_column("chr_name").to_list() for chrom in chroms: diff --git a/tests/match/test_match.py b/tests/match/test_match.py index 4ade32e..1acfc70 100644 --- a/tests/match/test_match.py +++ b/tests/match/test_match.py @@ -21,7 +21,6 @@ def test_only_match_pass(mini_scorefile, target_path, tmp_path): '--outdir', out_dir] # '--keep_ambiguous', '--keep_multiallelic'] - with patch('sys.argv', args): with pytest.raises(SystemExit) as se: match_variants() From 86aac2233f0a5aeef365869914978550b0bd57fc Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Fri, 18 Nov 2022 10:40:25 +0000 Subject: [PATCH 52/54] Collect matches by chromosome to reduce RAM? --- pgscatalog_utils/match/write.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index 409d39c..e44d764 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -29,14 +29,11 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): dominant: pl.LazyFrame recessive: pl.LazyFrame - # collect to cache! - matches: pl.LazyFrame = matches.collect().lazy() - if split: chroms: list[str] = matches.select("chr_name").unique().collect().get_column("chr_name").to_list() for chrom in chroms: - # 1. filter by chromosome - chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) + # 1. filter by chromosome & collect to cache! + chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom).collect().lazy() # 2. split by effect type additive, dominant, recessive = _split_effect_type(chrom_df) @@ -47,6 +44,9 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): # 4. pivot and write! _write_split(deduped, chrom, dataset) else: + # collect to cache! + matches: pl.LazyFrame = matches.collect().lazy() + # 1. 
split by effect type additive, dominant, recessive = _split_effect_type(matches) From 4761906ab7f134ff6c2a4b02f4deb17e7344a887 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 21 Nov 2022 10:32:52 +0000 Subject: [PATCH 53/54] collect minimal columns for writing only --- pgscatalog_utils/match/write.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index e44d764..ac53cb8 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -29,11 +29,17 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): dominant: pl.LazyFrame recessive: pl.LazyFrame + # collect and cache minimum required columns + min_cols: list[str] = ['accession', 'effect_type', 'chr_name', 'ID', 'matched_effect_allele', 'effect_weight'] + matches: pl.LazyFrame = (matches.select(min_cols) + .collect() + .lazy()) + if split: chroms: list[str] = matches.select("chr_name").unique().collect().get_column("chr_name").to_list() for chrom in chroms: - # 1. filter by chromosome & collect to cache! - chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom).collect().lazy() + # 1. filter by chromosome + chrom_df: pl.LazyFrame = matches.filter(pl.col('chr_name') == chrom) # 2. split by effect type additive, dominant, recessive = _split_effect_type(chrom_df) @@ -44,9 +50,6 @@ def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): # 4. pivot and write! _write_split(deduped, chrom, dataset) else: - # collect to cache! - matches: pl.LazyFrame = matches.collect().lazy() - # 1. 
split by effect type additive, dominant, recessive = _split_effect_type(matches) From c43e85d0c800e49b4421db90b57225a7161b21f4 Mon Sep 17 00:00:00 2001 From: smlmbrt Date: Mon, 21 Nov 2022 11:51:06 +0000 Subject: [PATCH 54/54] Write full log as csv --- pgscatalog_utils/match/write.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgscatalog_utils/match/write.py b/pgscatalog_utils/match/write.py index e44d764..fb1051e 100644 --- a/pgscatalog_utils/match/write.py +++ b/pgscatalog_utils/match/write.py @@ -20,7 +20,7 @@ def write_log(df: pl.LazyFrame, prefix: str, chrom: typing.Union[str, None], out logger.warning(f"Overwriting log that already exists: {fout}") os.remove(fout) - _write_text_pgzip(df=df, fout=fout) + _write_text_pgzip(df=df, sep = ',', fout=fout) def write_scorefiles(matches: pl.LazyFrame, split: bool, dataset: str): @@ -83,7 +83,7 @@ def _write_split(deduplicated: dict[str: tuple[int, pl.LazyFrame]], chrom: str, _write_text_pgzip(pivoted, fout) -def _write_text_pgzip(df: pl.LazyFrame, fout: str, append: bool = False): +def _write_text_pgzip(df: pl.LazyFrame, fout: str, sep: str = '\t', append: bool = False): """ Write a df to a text file (e.g. CSV / TSV) using parallel gzip, optionally appending to an existing file Notes: @@ -102,7 +102,7 @@ def _write_text_pgzip(df: pl.LazyFrame, fout: str, append: bool = False): mode = 'wb' with pgzip.open(fout, mode, thread=config.N_THREADS) as f: - df.collect().write_csv(f, sep='\t') + df.collect().write_csv(f, sep=sep) def _pivot_score(df: pl.LazyFrame, chrom: str) -> pl.LazyFrame: