Skip to content
This repository has been archived by the owner on Jan 21, 2025. It is now read-only.

Commit

Permalink
update descriptions in args
Browse files Browse the repository at this point in the history
  • Loading branch information
nebfield committed Aug 23, 2022
1 parent 309fcf5 commit c8ef7c9
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 54 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ $ combine_scorefiles -s PGS*.txt.gz -o combined.txt
$ match_variants -s combined.txt -t <example.pvar> --min_overlap 0.75 --outdir .
```

More details are available using the `--help` parameter.

## Install from source

Requirements:
Expand All @@ -44,6 +46,7 @@ $ pip install --user dist/*.whl
```

## Credits

The `pgscatalog_utils` package is developed as part of the **Polygenic Score (PGS) Catalog**
([www.PGSCatalog.org](https://www.PGSCatalog.org)) project, a collaboration between the
University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert, Laurent Gil)
Expand Down
64 changes: 45 additions & 19 deletions pgscatalog_utils/download/download_scorefile.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import logging
import argparse
import logging
import os
import shutil
import textwrap
from contextlib import closing
from functools import reduce
from urllib import request as request
import sys

from pgscatalog_utils.download.publication import query_publication
from pgscatalog_utils.download.score import get_url
Expand All @@ -15,24 +15,8 @@
logger = logging.getLogger(__name__)


def parse_args(args=None) -> argparse.Namespace:
    """Parse the command-line options of the scoring file downloader.

    Args:
        args: Optional list of argument strings. When None, argparse reads
            the arguments from the process command line.

    Returns:
        Namespace with the attributes pgs, efo, pgp, build, outdir and
        verbose.
    """
    cli = argparse.ArgumentParser(description='Download scoring files from the PGS Catalog')

    # Declarative table of (flags, add_argument keyword options); the order
    # here is the order in which options are listed in the --help output.
    option_table = (
        (('-i', '--pgs'),
         dict(dest='pgs', nargs='+', help='PGS Catalog ID(s) (e.g. PGS000001)')),
        (('-t', '--efo'),
         dict(dest='efo', nargs='+',
              help='Traits described by an EFO term(s) (e.g. EFO_0004611)')),
        (('-p', '--pgp'),
         dict(dest='pgp', nargs='+', help='PGP publication ID(s) (e.g. PGP000007)')),
        (('-b', '--build'),
         dict(dest='build',
              help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')),
        (('-o', '--outdir'),
         dict(dest='outdir', required=True, default='scores/',
              help='<Required> Output directory to store downloaded files')),
        (('-v', '--verbose'),
         dict(dest='verbose', action='store_true', help='Extra logging information')),
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args(args)


def download_scorefile() -> None:
args = parse_args()
args = _parse_args()
set_logging_level(args.verbose)
_check_args(args)
_mkdir(args.outdir)
Expand Down Expand Up @@ -96,5 +80,47 @@ def _check_args(args):
raise Exception


def _description_text() -> str:
return textwrap.dedent('''\
Download a set of scoring files from the PGS Catalog using PGS
Scoring IDs, traits, or publication IDs.
The PGS Catalog API is queried to get a list of scoring file
URLs. Scoring files are downloaded via FTP to a specified
directory. PGS Catalog scoring files are staged with the name:
{PGS_ID}_hmPOS_{genome_build}.txt.gz
These harmonised scoring files contain genomic coordinates,
remapped from author-submitted information such as rsids.
''')


def _epilog_text() -> str:
return textwrap.dedent('''\
download_scorefiles will skip downloading a scoring file if it
already exists in the download directory. This can be useful if
the download process is interrupted and needs to be restarted
later. You can track download progress with the verbose flag.
''')


def _parse_args(args=None) -> argparse.Namespace:
    """Parse the command-line options of the scoring file downloader.

    The description and epilog are taken from _description_text() and
    _epilog_text() and are rendered verbatim (RawDescriptionHelpFormatter).

    Args:
        args: Optional list of argument strings. When None, argparse reads
            the arguments from the process command line.

    Returns:
        Namespace with the attributes pgs, efo, pgp, build, outdir and
        verbose.
    """
    cli = argparse.ArgumentParser(
        description=_description_text(),
        epilog=_epilog_text(),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Declarative table of (flags, add_argument keyword options); the order
    # here is the order in which options are listed in the --help output.
    option_table = (
        (('-i', '--pgs'),
         dict(dest='pgs', nargs='+', help='PGS Catalog ID(s) (e.g. PGS000001)')),
        (('-t', '--efo'),
         dict(dest='efo', nargs='+',
              help='Traits described by an EFO term(s) (e.g. EFO_0004611)')),
        (('-p', '--pgp'),
         dict(dest='pgp', nargs='+', help='PGP publication ID(s) (e.g. PGP000007)')),
        (('-b', '--build'),
         dict(dest='build',
              help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')),
        (('-o', '--outdir'),
         dict(dest='outdir', required=True, default='scores/',
              help='<Required> Output directory to store downloaded files')),
        (('-v', '--verbose'),
         dict(dest='verbose', action='store_true',
              help='<Optional> Extra logging information')),
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args(args)


if __name__ == "__main__":
download_scorefile()
70 changes: 61 additions & 9 deletions pgscatalog_utils/match/match_variants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import logging
import textwrap
from glob import glob

import polars as pl
Expand Down Expand Up @@ -103,10 +104,58 @@ def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multi
return pl.concat(matches)


def _description_text() -> str:
return textwrap.dedent('''\
Match variants from a combined scoring file against a set of
target genomes from the same fileset, and output scoring files
compatible with the plink2 --score function.
A combined scoring file is the output of the combine_scorefiles
script. It has the following structure:
| chr_name | chr_position | ... | accession |
| -------- | ------------ | --- | --------- |
| 1 | 1 | ... | PGS000802 |
The combined scoring file is in long format, with one row per
variant for each scoring file (accession). This structure is
different to the PGS Catalog standard, because the long format
makes matching faster and simpler.
Target genomes can be in plink1 bim format or plink2 pvar
format. Variant IDs should be unique.
Only one set of target genomes should be matched at a time. Don't
try to match target genomes from different plink
filesets. Matching against a set of chromosomes from the same
fileset is OK (see --split).
''')


def _epilog_text() -> str:
return textwrap.dedent('''\
match_variants will output at least one scoring file in a
format compatible with the plink2 --score function. This
output might be split across different files to ensure each
variant ID, effect allele, and effect type appears only once
in each file. Output files have the pattern:
{dataset}_{chromosome}_{effect_type}_{n}.scorefile.
If multiple chromosomes are combined into a single file (i.e. not
--split), then {chromosome} is replaced with 'ALL'. Once the
scorefiles are used to calculate a score with plink2, the .sscore
files will need to be aggregated to calculate a single polygenic
score for each dataset, sample, and accession (scoring file). The
PGS Catalog Calculator does this automatically.
''')


def _parse_args(args=None):
parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants')
parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--dataset', dest='dataset', required=True,
help='<Required> Label for target genomic dataset (e.g. "-d thousand_genomes")')
help='<Required> Label for target genomic dataset')
parser.add_argument('-s', '--scorefiles', dest='scorefile', required=True,
help='<Required> Combined scorefile path (output of read_scorefiles.py)')
parser.add_argument('-t', '--target', dest='target', required=True,
Expand All @@ -120,15 +169,18 @@ def _parse_args(args=None):
parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True,
type=float, help='<Required> Minimum proportion of variants to match before error')
parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false',
help='Flag to force the program to keep variants with ambiguous alleles, (e.g. A/T and G/C '
'SNPs), which are normally excluded (default: false). In this case the program proceeds '
'assuming that the genotype data is on the same strand as the GWAS whose summary '
'statistics were used to construct the score.'),
help='''<Optional> Flag to force the program to keep variants with
ambiguous alleles, (e.g. A/T and G/C SNPs), which are normally
excluded (default: false). In this case the program proceeds
assuming that the genotype data is on the same strand as the
GWAS whose summary statistics were used to construct the score.
''')
parser.add_argument('--keep_multiallelic', dest='remove_multiallelic', action='store_false',
help='Flag to allow matching to multiallelic variants (default: false).')
help='<Optional> Flag to allow matching to multiallelic variants (default: false).')
parser.add_argument('--ignore_strand_flips', dest='skip_flip', action='store_true',
help='Flag to not consider matched variants that may be reported on the opposite strand. '
'Default behaviour is to flip/complement unmatched variants and check if they match.')
help='''<Optional> Flag to not consider matched variants that may be reported
on the opposite strand. Default behaviour is to flip/complement unmatched variants and check if
they match.''')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='<Optional> Extra logging information')
return parser.parse_args(args)
Expand Down
72 changes: 46 additions & 26 deletions pgscatalog_utils/scorefile/combine_scorefiles.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import argparse
import sys
import logging
import sys
import textwrap

import pandas as pd

from pgscatalog_utils.log_config import set_logging_level
Expand All @@ -11,32 +13,8 @@
from pgscatalog_utils.scorefile.write import write_scorefile


def parse_args(args=None) -> argparse.Namespace:
    """Parse the command-line options for combining scoring files.

    The liftover options (-t, -c, -m) become required only when --liftover
    is present in the arguments that are actually being parsed.

    Args:
        args: Optional list of argument strings. When None, argparse reads
            the arguments from the process command line.

    Returns:
        Namespace with the attributes scorefiles, liftover, target_build,
        chain_dir, min_lift, drop_missing, outfile and verbose.

    Raises:
        SystemExit: raised by argparse when a required option is missing
            or an option value is invalid.
    """
    # Inspect the same argument list argparse will parse. Looking only at
    # sys.argv would ignore an explicitly supplied `args` list, so the
    # conditional requirements below would silently not be enforced.
    argv = sys.argv[1:] if args is None else list(args)
    liftover_requested = '--liftover' in argv

    parser = argparse.ArgumentParser(description='Combine multiple scoring files')
    parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+',
                        help='<Required> Scorefile path (wildcard * is OK)', required=True)
    parser.add_argument('--liftover', dest='liftover',
                        help='<Optional> Convert scoring file variants to target genome build?',
                        action='store_true')
    parser.add_argument('-t', '--target_build', dest='target_build',
                        help='Build of target genome <GRCh37 / GRCh38>',
                        required=liftover_requested)
    parser.add_argument('-c', '--chain_dir', dest='chain_dir',
                        help='Path to directory containing chain files',
                        required=liftover_requested)
    # NOTE: kept required-with-liftover alongside a default to preserve the
    # existing CLI contract; the default only applies without --liftover.
    parser.add_argument('-m', '--min_lift', dest='min_lift',
                        help='If liftover, minimum proportion of variants lifted over',
                        required=liftover_requested, default=0.95, type=float)
    parser.add_argument('--drop_missing', dest='drop_missing', action='store_true',
                        help='Drop variants with missing information (chr/pos) and '
                             'non-standard alleles from the output file.')
    parser.add_argument('-o', '--outfile', dest='outfile', required=True,
                        default='combined.txt',
                        help='<Required> Output path to combined long scorefile')
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                        help='<Optional> Extra logging information')
    return parser.parse_args(argv)


def combine_scorefiles():
args = parse_args()
args = _parse_args()

logger = logging.getLogger(__name__)
set_logging_level(args.verbose)
Expand All @@ -61,3 +39,45 @@ def _read_and_melt(path, drop_missing: bool = False):

if __name__ == "__main__":
combine_scorefiles()


def _description_text() -> str:
return textwrap.dedent('''\
Combine multiple scoring files in PGS Catalog format to a 'long'
table, and optionally liftover genomic coordinates to GRCh37 or
GRCh38. Custom scorefiles in PGS Catalog format can be combined
with PGS Catalog scoring files. The program can accept a mix of
unharmonised and harmonised PGS Catalog data.
''')


def _epilog_text() -> str:
return textwrap.dedent('''\
The long table is used to simplify intersecting variants in target
genomes and the scoring files with the match_variants program.
''')


def _parse_args(args=None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+',
help='<Required> Scorefile path (wildcard * is OK)', required=True)
parser.add_argument('--liftover', dest='liftover',
help='<Optional> Convert scoring file variants to target genome build?', action='store_true')
parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome <GRCh37 / GRCh38>',
required='--liftover' in sys.argv)
parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files',
required="--liftover" in sys.argv)
parser.add_argument('-m', '--min_lift', dest='min_lift',
help='If liftover, minimum proportion of variants lifted over',
required="--liftover" in sys.argv, default=0.95, type=float)
parser.add_argument('--drop_missing', dest='drop_missing', action='store_true',
help='Drop variants with missing information (chr/pos) and '
'non-standard alleles from the output file.')
parser.add_argument('-o', '--outfile', dest='outfile', required=True,
default='combined.txt',
help='<Required> Output path to combined long scorefile')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='<Optional> Extra logging information')
return parser.parse_args(args)

0 comments on commit c8ef7c9

Please sign in to comment.