update descriptions in args

PGScatalog · Aug 23, 2022 · c8ef7c9 · c8ef7c9
1 parent 309fcf5
commit c8ef7c9
Show file tree

Hide file tree

Showing 4 changed files with 155 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -28,6 +28,8 @@ $ combine_scorefiles -s PGS*.txt.gz -o combined.txt
 $ match_variants -s combined.txt -t <example.pvar> --min_overlap 0.75 --outdir .
 ```
 
+More details are available using the `--help` parameter.
+
 ## Install from source
 
 Requirements:
@@ -44,6 +46,7 @@ $ pip install --user dist/*.whl
 ```
 
 ## Credits
+
 The `pgscatalog_utils` package is developed as part of the **Polygenic Score (PGS) Catalog** 
 ([www.PGSCatalog.org](https://www.PGSCatalog.org)) project, a collaboration between the 
 University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert, Laurent Gil) 

diff --git a/pgscatalog_utils/download/download_scorefile.py b/pgscatalog_utils/download/download_scorefile.py
@@ -1,11 +1,11 @@
-import logging
 import argparse
+import logging
 import os
 import shutil
+import textwrap
 from contextlib import closing
 from functools import reduce
 from urllib import request as request
-import sys
 
 from pgscatalog_utils.download.publication import query_publication
 from pgscatalog_utils.download.score import get_url
@@ -15,24 +15,8 @@
 logger = logging.getLogger(__name__)
 
 
-def parse_args(args=None) -> argparse.Namespace:
-    parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Download scoring files from the PGS Catalog')
-    parser.add_argument('-i', '--pgs', nargs='+', dest='pgs', help='PGS Catalog ID(s) (e.g. PGS000001)')
-    parser.add_argument('-t', '--efo', dest='efo', nargs='+',
-                        help='Traits described by an EFO term(s) (e.g. EFO_0004611)')
-    parser.add_argument('-p', '--pgp', dest='pgp', help='PGP publication ID(s) (e.g. PGP000007)', nargs='+')
-    parser.add_argument('-b', '--build', dest='build',
-                        help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')
-    parser.add_argument('-o', '--outdir', dest='outdir', required=True,
-                        default='scores/',
-                        help='<Required> Output directory to store downloaded files')
-    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-                        help='Extra logging information')
-    return parser.parse_args(args)
-
-
 def download_scorefile() -> None:
-    args = parse_args()
+    args = _parse_args()
     set_logging_level(args.verbose)
     _check_args(args)
     _mkdir(args.outdir)
@@ -96,5 +80,47 @@ def _check_args(args):
                 raise Exception
 
 
+def _description_text() -> str:
+    """Return the description passed to the download_scorefile argument parser."""
+    raw_description = '''\
+    Download a set of scoring files from the PGS Catalog using PGS
+    Scoring IDs, traits, or publication IDs.
+    
+    The PGS Catalog API is queried to get a list of scoring file
+    URLs. Scoring files are downloaded via FTP to a specified
+    directory. PGS Catalog scoring files are staged with the name:
+    
+        {PGS_ID}_hmPOS_{genome_build}.txt.gz
+    
+    These harmonised scoring files contain genomic coordinates,
+    remapped from author-submitted information such as rsids.
+   '''
+    # dedent removes the shared left margin; the example file name keeps
+    # its extra indentation relative to the paragraphs around it.
+    return textwrap.dedent(raw_description)
+
+
+def _epilog_text() -> str:
+    """Return the epilog passed to the download_scorefile argument parser."""
+    raw_epilog = '''\
+    download_scorefiles will skip downloading a scoring file if it
+    already exists in the download directory. This can be useful if
+    the download process is interrupted and needs to be restarted
+    later. You can track download progress with the verbose flag.    
+   '''
+    # Strip the common indentation that comes from nesting the literal here.
+    return textwrap.dedent(raw_epilog)
+
+
+def _parse_args(args=None) -> argparse.Namespace:
+    """Parse the download_scorefile command line.
+
+    ``args`` is handed to ``ArgumentParser.parse_args`` unchanged, so the
+    default of ``None`` lets argparse read the process arguments.
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=_description_text(),
+        epilog=_epilog_text(),
+    )
+
+    # Selectors for what to download; each accepts one or more values.
+    cli.add_argument('-i', '--pgs', dest='pgs', nargs='+',
+                     help='PGS Catalog ID(s) (e.g. PGS000001)')
+    cli.add_argument('-t', '--efo', dest='efo', nargs='+',
+                     help='Traits described by an EFO term(s) (e.g. EFO_0004611)')
+    cli.add_argument('-p', '--pgp', dest='pgp', nargs='+',
+                     help='PGP publication ID(s) (e.g. PGP000007)')
+
+    cli.add_argument('-b', '--build', dest='build',
+                     help='Download Harmonized Scores with Positions in Genome build: GRCh37 or GRCh38')
+    cli.add_argument('-o', '--outdir', dest='outdir', default='scores/', required=True,
+                     help='<Required> Output directory to store downloaded files')
+    cli.add_argument('-v', '--verbose', dest='verbose', action='store_true',
+                     help='<Optional> Extra logging information')
+
+    namespace = cli.parse_args(args)
+    return namespace
+
+
 if __name__ == "__main__":
     download_scorefile()
diff --git a/pgscatalog_utils/match/match_variants.py b/pgscatalog_utils/match/match_variants.py
@@ -1,5 +1,6 @@
 import argparse
 import logging
+import textwrap
 from glob import glob
 
 import polars as pl
@@ -103,10 +104,58 @@ def _match_single_target(target_path: str, scorefile: pl.DataFrame, remove_multi
     return pl.concat(matches)
 
 
+def _description_text() -> str:
+    """Return the description passed to the match_variants argument parser."""
+    raw_description = '''\
+    Match variants from a combined scoring file against a set of
+    target genomes from the same fileset, and output scoring files
+    compatible with the plink2 --score function.
+    
+    A combined scoring file is the output of the combine_scorefiles
+    script. It has the following structure:
+    
+        | chr_name | chr_position | ... | accession |
+        | -------- | ------------ | --- | --------- |
+        | 1        | 1            | ... | PGS000802 |
+    
+    The combined scoring file is in long format, with one row per
+    variant for each scoring file (accession). This structure is
+    different to the PGS Catalog standard, because the long format
+    makes matching faster and simpler.
+    
+    Target genomes can be in plink1 bim format or plink2 pvar
+    format. Variant IDs should be unique.
+    
+    Only one set of target genomes should be matched at a time. Don't
+    try to match target genomes from different plink
+    filesets. Matching against a set of chromosomes from the same
+    fileset is OK (see --split). 
+   '''
+    # dedent removes the shared left margin; the example table keeps its
+    # extra indentation relative to the paragraphs around it.
+    return textwrap.dedent(raw_description)
+
+
+def _epilog_text() -> str:
+    """Return the epilog passed to the match_variants argument parser."""
+    raw_epilog = '''\
+    match_variants will output at least one scoring file in a
+    format compatible with the plink2 --score function. This
+    output might be split across different files to ensure each
+    variant ID, effect allele, and effect type appears only once
+    in each file. Output files have the pattern:
+
+        {dataset}_{chromosome}_{effect_type}_{n}.scorefile.
+
+    If multiple chromosomes are combined into a single file (i.e. not
+    --split), then {chromosome} is replaced with 'ALL'. Once the
+    scorefiles are used to calculate a score with plink2, the .sscore
+    files will need to be aggregated to calculate a single polygenic
+    score for each dataset, sample, and accession (scoring file). The
+    PGS Catalog Calculator does this automatically.
+    '''
+    # Strip the common indentation that comes from nesting the literal here.
+    return textwrap.dedent(raw_epilog)
+
+
 def _parse_args(args=None):
-    parser = argparse.ArgumentParser(description='Match variants from a combined scoring file against target variants')
+    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('-d', '--dataset', dest='dataset', required=True,
-                        help='<Required> Label for target genomic dataset (e.g. "-d thousand_genomes")')
+                        help='<Required> Label for target genomic dataset')
     parser.add_argument('-s', '--scorefiles', dest='scorefile', required=True,
                         help='<Required> Combined scorefile path (output of read_scorefiles.py)')
     parser.add_argument('-t', '--target', dest='target', required=True,
@@ -120,15 +169,18 @@ def _parse_args(args=None):
     parser.add_argument('-m', '--min_overlap', dest='min_overlap', required=True,
                         type=float, help='<Required> Minimum proportion of variants to match before error')
     parser.add_argument('--keep_ambiguous', dest='remove_ambiguous', action='store_false',
-                        help='Flag to force the program to keep variants with ambiguous alleles, (e.g. A/T and G/C '
-                             'SNPs), which are normally excluded (default: false). In this case the program proceeds '
-                             'assuming that the genotype data is on the same strand as the GWAS whose summary '
-                             'statistics were used to construct the score.'),
+                        help='''<Optional> Flag to force the program to keep variants with
+                        ambiguous alleles, (e.g. A/T and G/C SNPs), which are normally
+                        excluded (default: false). In this case the program proceeds
+                        assuming that the genotype data is on the same strand as the
+                        GWAS whose summary statistics were used to construct the score.
+    			        ''')
     parser.add_argument('--keep_multiallelic', dest='remove_multiallelic', action='store_false',
-                        help='Flag to allow matching to multiallelic variants (default: false).')
+                        help='<Optional> Flag to allow matching to multiallelic variants (default: false).')
     parser.add_argument('--ignore_strand_flips', dest='skip_flip', action='store_true',
-                        help='Flag to not consider matched variants that may be reported on the opposite strand. '
-                             'Default behaviour is to flip/complement unmatched variants and check if they match.')
+                        help='''<Optional> Flag to not consider matched variants that may be reported 
+                        on the opposite strand.  Default behaviour is to flip/complement unmatched variants and check if
+                        they match.''')
     parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                         help='<Optional> Extra logging information')
     return parser.parse_args(args)

diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py
@@ -1,6 +1,8 @@
 import argparse
-import sys
 import logging
+import sys
+import textwrap
+
 import pandas as pd
 
 from pgscatalog_utils.log_config import set_logging_level
@@ -11,32 +13,8 @@
 from pgscatalog_utils.scorefile.write import write_scorefile
 
 
-def parse_args(args=None) -> argparse.Namespace:
-    parser: argparse.ArgumentParser = argparse.ArgumentParser(description='Combine multiple scoring files')
-    parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+',
-                        help='<Required> Scorefile path (wildcard * is OK)', required=True)
-    parser.add_argument('--liftover', dest='liftover',
-                        help='<Optional> Convert scoring file variants to target genome build?', action='store_true')
-    parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome <GRCh37 / GRCh38>',
-                        required='--liftover' in sys.argv)
-    parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files',
-                        required="--liftover" in sys.argv)
-    parser.add_argument('-m', '--min_lift', dest='min_lift',
-                        help='If liftover, minimum proportion of variants lifted over',
-                        required="--liftover" in sys.argv, default=0.95, type=float)
-    parser.add_argument('--drop_missing', dest='drop_missing', action='store_true',
-                        help='Drop variants with missing information (chr/pos) and '
-                             'non-standard alleles from the output file.')
-    parser.add_argument('-o', '--outfile', dest='outfile', required=True,
-                        default='combined.txt',
-                        help='<Required> Output path to combined long scorefile')
-    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-                        help='<Optional> Extra logging information')
-    return parser.parse_args(args)
-
-
 def combine_scorefiles():
-    args = parse_args()
+    args = _parse_args()
 
     logger = logging.getLogger(__name__)
     set_logging_level(args.verbose)
@@ -61,3 +39,63 @@ def _read_and_melt(path, drop_missing: bool = False):
 
+def _description_text() -> str:
+    """Return the description passed to the combine_scorefiles argument parser."""
+    return textwrap.dedent('''\
+    Combine multiple scoring files in PGS Catalog format to a 'long'
+    table, and optionally liftover genomic coordinates to GRCh37 or
+    GRCh38. Custom scorefiles in PGS Catalog format can be combined
+    with PGS Catalog scoring files. The program can accept a mix of
+    unharmonised and harmonised PGS Catalog data.     
+    ''')
+
+
+def _epilog_text() -> str:
+    """Return the epilog passed to the combine_scorefiles argument parser."""
+    return textwrap.dedent('''\
+    The long table is used to simplify intersecting variants in target
+    genomes and the scoring files with the match_variants program.    
+    ''')
+
+
+def _parse_args(args=None) -> argparse.Namespace:
+    """Parse the combine_scorefiles command line.
+
+    Args:
+        args: Argument strings to parse. When None, argparse reads
+            sys.argv[1:].
+
+    Returns:
+        The populated argparse.Namespace.
+    """
+    # The liftover options are only mandatory when --liftover is requested.
+    # Look for the flag in the list that will actually be parsed, so an
+    # explicit ``args`` list is honoured instead of always reading sys.argv.
+    argv = sys.argv[1:] if args is None else args
+    liftover_requested = '--liftover' in argv
+
+    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+',
+                        help='<Required> Scorefile path (wildcard * is OK)', required=True)
+    parser.add_argument('--liftover', dest='liftover',
+                        help='<Optional> Convert scoring file variants to target genome build?', action='store_true')
+    parser.add_argument('-t', '--target_build', dest='target_build', help='Build of target genome <GRCh37 / GRCh38>',
+                        required=liftover_requested)
+    parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files',
+                        required=liftover_requested)
+    # NOTE(review): while required, the 0.95 default is never applied with --liftover.
+    parser.add_argument('-m', '--min_lift', dest='min_lift',
+                        help='If liftover, minimum proportion of variants lifted over',
+                        required=liftover_requested, default=0.95, type=float)
+    parser.add_argument('--drop_missing', dest='drop_missing', action='store_true',
+                        help='Drop variants with missing information (chr/pos) and '
+                             'non-standard alleles from the output file.')
+    parser.add_argument('-o', '--outfile', dest='outfile', required=True,
+                        default='combined.txt',
+                        help='<Required> Output path to combined long scorefile')
+    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
+                        help='<Optional> Extra logging information')
+    return parser.parse_args(args)
+
+
 if __name__ == "__main__":
     combine_scorefiles()