diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ca15627..cfdbe03d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,11 @@ If MD5 sum is not listed for a certain release then it means that the container ### Misc * Miscellaneous goes here +## [1.3.8] - 2023-10-17 + +### Fixed + +* Added `--genomic-build hg18/hg19/hg38` option to `ldpred2.R` to use correct LD reference meta file ``pos`` column name ## [1.3.7] - 2023-10-17 diff --git a/scripts/pgs/LDpred2/README.md b/scripts/pgs/LDpred2/README.md index bc8f5477..422e8a4e 100644 --- a/scripts/pgs/LDpred2/README.md +++ b/scripts/pgs/LDpred2/README.md @@ -31,19 +31,19 @@ yielding: ``` usage: ldpred2.R [--] [--help] [--out-merge] [--geno-impute-zero] - [--opts OPTS] [--geno-file-rds GENO-FILE-RDS] [--sumstats - SUMSTATS] [--out OUT] [--out-merge-ids OUT-MERGE-IDS] - [--file-keep-snps FILE-KEEP-SNPS] [--ld-file LD-FILE] - [--ld-meta-file LD-META-FILE] [--chr2use CHR2USE] [--col-chr - COL-CHR] [--col-snp-id COL-SNP-ID] [--col-A1 COL-A1] [--col-A2 - COL-A2] [--col-bp COL-BP] [--col-stat COL-STAT] [--col-stat-se - COL-STAT-SE] [--col-pvalue COL-PVALUE] [--col-n COL-N] - [--stat-type STAT-TYPE] [--effective-sample-size + [--merge-by-rsid] [--opts OPTS] [--geno-file-rds GENO-FILE-RDS] + [--sumstats SUMSTATS] [--out OUT] [--out-merge-ids + OUT-MERGE-IDS] [--file-keep-snps FILE-KEEP-SNPS] [--ld-file + LD-FILE] [--ld-meta-file LD-META-FILE] [--chr2use CHR2USE] + [--col-chr COL-CHR] [--col-snp-id COL-SNP-ID] [--col-A1 COL-A1] + [--col-A2 COL-A2] [--col-bp COL-BP] [--col-stat COL-STAT] + [--col-stat-se COL-STAT-SE] [--col-pvalue COL-PVALUE] [--col-n + COL-N] [--stat-type STAT-TYPE] [--effective-sample-size EFFECTIVE-SAMPLE-SIZE] [--n-cases N-CASES] [--n-controls N-CONTROLS] [--name-score NAME-SCORE] [--hyper-p-length HYPER-P-LENGTH] [--hyper-p-max HYPER-P-MAX] [--ldpred-mode LDPRED-MODE] [--cores CORES] [--set-seed SET-SEED] - [--merge-by-rsid MERGE-BY-RSID] + [--genomic-build GENOMIC-BUILD] Calculate polygenic scores using 
ldpred2 @@ -89,6 +89,15 @@ $RSCRIPT createBackingFile.R EUR.nomiss.bed EUR.nomiss.rds $RSCRIPT ldpred2.R --geno-file-rds EUR.nomiss.rds ... ``` +### Note on genomic builds + +By default the LDpred2 scripts assume that the genotype data and summary statistics use build GRCh37/hg19, +but there are no explicit checks for consistent builds across input files. +If the genotype data and summary statistics file use another build, the ``--genomic-build`` flag should be used to specify the build version, +passing either `hg18`, `hg19` or `hg38` as the argument. +As of now, setting this argument will affect the loading of LD metadata only, but not the genotype data or summary statistics. +A symptom of using the wrong build is that the script will match only a small fraction of variants between the genotype data, summary statistics file and/or LD reference data. + ### Optional: Estimating linkage disequillibrium (LD) LDpred2 uses the LD structure when calculating polygenic scores. By default, the LDpred2.R script uses LD structure based on European samples provided by the LDpred2 authors. diff --git a/scripts/pgs/LDpred2/ldpred2.R b/scripts/pgs/LDpred2/ldpred2.R index 771153a4..a5f0881f 100644 --- a/scripts/pgs/LDpred2/ldpred2.R +++ b/scripts/pgs/LDpred2/ldpred2.R @@ -53,6 +53,7 @@ par <- add_argument(par, "--ldpred-mode", help='Ether "auto" or "inf" (infinites par <- add_argument(par, "--cores", help="Number of CPU cores to use, otherwise use the available number of cores minus 1", default=nb_cores()) par <- add_argument(par, '--set-seed', help="Set a seed for reproducibility", nargs=1) par <- add_argument(par, "--merge-by-rsid", help="Merge using rsid (the default is to merge by chr:bp:a1:a2 codes).", flag=TRUE) +par <- add_argument(par, "--genomic-build", help="Genomic build to use. 
Either hg19, hg18 or hg38", default="hg19", nargs=1) parsed <- parse_args(par) @@ -152,6 +153,21 @@ if (genoImputeZero) { cat('\n### Reading LD reference meta-file from ', fileMetaLD, '\n') map_ldref <- readRDS(fileMetaLD) +# validate --genomic-build, then expose the matching build-specific position column of map_ldref ("pos_hg18"/"pos_hg38") as "pos": +if (!parsed$genomic_build %in% c('hg18', 'hg19', 'hg38')) stop('Genomic build should be one of "hg19", "hg18", "hg38"') +if (parsed$genomic_build == 'hg38') { + cat('Renaming "pos_hg38" column in LD reference meta info as "pos"\n') + map_ldref$pos <- map_ldref$pos_hg38 + map_ldref$pos_hg38 <- NULL +} else if (parsed$genomic_build == 'hg18') { + cat('Renaming "pos_hg18" column in LD reference meta info as "pos"\n') + map_ldref$pos <- map_ldref$pos_hg18 + map_ldref$pos_hg18 <- NULL +} else { + # hg19 (the default): keep the existing "pos" column unchanged +} + + cat('\n### Reading summary statistics', fileSumstats,'\n') sumstats <- bigreadr::fread2(fileSumstats) cat('Loaded', nrow(sumstats), 'SNPs\n') diff --git a/version/version.py b/version/version.py index 69693d52..8da07eb4 100644 --- a/version/version.py +++ b/version/version.py @@ -2,7 +2,7 @@ _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "7" +_PATCH = "8" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""