LDpred2: Build 37/38 behavior/documentation (#202)

* LDpred2: Build 37/38 behavior/documentation Fixes #189 * LDpred2: Build 37/38 behavior/documentation Fixes #189 * the ways of R... * Update CHANGELOG.md * Update version.py
comorment · Oct 17, 2023 · deed453 · deed453
1 parent 92b8fcb
commit deed453
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,6 +38,11 @@ If MD5 sum is not listed for a certain release then it means that the container
 ### Misc
 
 * Miscellaneous goes here
+## [1.3.8] - 2023-10-17
+
+### Fixed
+
+* Added `--genomic-build hg18/hg19/hg38` option to `ldpred2.R` so that the matching ``pos`` column of the LD reference meta file is used
 
 ## [1.3.7] - 2023-10-17
 

diff --git a/scripts/pgs/LDpred2/README.md b/scripts/pgs/LDpred2/README.md
@@ -31,19 +31,19 @@ yielding:
 
 ```
 usage: ldpred2.R [--] [--help] [--out-merge] [--geno-impute-zero]
-       [--opts OPTS] [--geno-file-rds GENO-FILE-RDS] [--sumstats
-       SUMSTATS] [--out OUT] [--out-merge-ids OUT-MERGE-IDS]
-       [--file-keep-snps FILE-KEEP-SNPS] [--ld-file LD-FILE]
-       [--ld-meta-file LD-META-FILE] [--chr2use CHR2USE] [--col-chr
-       COL-CHR] [--col-snp-id COL-SNP-ID] [--col-A1 COL-A1] [--col-A2
-       COL-A2] [--col-bp COL-BP] [--col-stat COL-STAT] [--col-stat-se
-       COL-STAT-SE] [--col-pvalue COL-PVALUE] [--col-n COL-N]
-       [--stat-type STAT-TYPE] [--effective-sample-size
+       [--merge-by-rsid] [--opts OPTS] [--geno-file-rds GENO-FILE-RDS]
+       [--sumstats SUMSTATS] [--out OUT] [--out-merge-ids
+       OUT-MERGE-IDS] [--file-keep-snps FILE-KEEP-SNPS] [--ld-file
+       LD-FILE] [--ld-meta-file LD-META-FILE] [--chr2use CHR2USE]
+       [--col-chr COL-CHR] [--col-snp-id COL-SNP-ID] [--col-A1 COL-A1]
+       [--col-A2 COL-A2] [--col-bp COL-BP] [--col-stat COL-STAT]
+       [--col-stat-se COL-STAT-SE] [--col-pvalue COL-PVALUE] [--col-n
+       COL-N] [--stat-type STAT-TYPE] [--effective-sample-size
        EFFECTIVE-SAMPLE-SIZE] [--n-cases N-CASES] [--n-controls
        N-CONTROLS] [--name-score NAME-SCORE] [--hyper-p-length
        HYPER-P-LENGTH] [--hyper-p-max HYPER-P-MAX] [--ldpred-mode
        LDPRED-MODE] [--cores CORES] [--set-seed SET-SEED]
-       [--merge-by-rsid MERGE-BY-RSID]
+       [--genomic-build GENOMIC-BUILD]
 
 Calculate polygenic scores using ldpred2
 
@@ -89,6 +89,15 @@ $RSCRIPT createBackingFile.R EUR.nomiss.bed EUR.nomiss.rds
 $RSCRIPT ldpred2.R --geno-file-rds EUR.nomiss.rds ...
 ```
 
+### Note on genomic builds
+
+By default the LDpred2 scripts assume that the genotype data and summary statistics use build GRCh37/hg19, 
+but there are no explicit checks for consistent builds across input files.
+If the genotype data and summary statistics file use another build, the ``--genomic-build <build>`` flag should be used to specify the build version,
+passing either `hg18`, `hg19` or `hg38` as the argument.
+As of now, setting this argument will affect the loading of LD metadata only, but not the genotype data or summary statistics.
+A symptom of using the wrong build is that the script will match only a small fraction of variants between the genotype data, summary statistics file and/or LD reference data.
+
 ### Optional: Estimating linkage disequilibrium (LD)
 
 LDpred2 uses the LD structure when calculating polygenic scores. By default, the LDpred2.R script uses LD structure based on European samples provided by the LDpred2 authors.

diff --git a/scripts/pgs/LDpred2/ldpred2.R b/scripts/pgs/LDpred2/ldpred2.R
@@ -53,6 +53,7 @@ par <- add_argument(par, "--ldpred-mode", help='Ether "auto" or "inf" (infinites
 par <- add_argument(par, "--cores", help="Number of CPU cores to use, otherwise use the available number of cores minus 1", default=nb_cores())
 par <- add_argument(par, '--set-seed', help="Set a seed for reproducibility", nargs=1)
 par <- add_argument(par, "--merge-by-rsid", help="Merge using rsid (the default is to merge by chr:bp:a1:a2 codes).", flag=TRUE)
+par <- add_argument(par, "--genomic-build", help="Genomic build to use. Either hg19, hg18 or hg38", default="hg19", nargs=1)
 
 parsed <- parse_args(par)
 
@@ -152,6 +153,21 @@ if (genoImputeZero) {
 cat('\n### Reading LD reference meta-file from ', fileMetaLD, '\n')
 map_ldref <- readRDS(fileMetaLD)
 
+# Use the LD reference position column that matches the requested genomic build.
+# For hg18/hg38 the build-specific column ("pos_hg18"/"pos_hg38") replaces "pos";
+# for hg19 (the default) the existing "pos" column is left untouched.
+# NOTE(review): assumes map_ldref is a data.frame/list carrying these columns -- confirm for custom LD meta files.
+genomicBuild <- parsed$genomic_build
+if (!genomicBuild %in% c('hg18', 'hg19', 'hg38')) stop('Genomic build should be one of "hg19", "hg18", "hg38"')
+if (genomicBuild != 'hg19') {
+  # Compare against the validated values; the previous 'hg_38'/'hg_18' literals could never match.
+  colPosBuild <- paste0('pos_', genomicBuild)
+  # Fail early: assigning a missing (NULL) column to "pos" would silently delete "pos".
+  if (!colPosBuild %in% names(map_ldref)) stop('LD reference meta info has no "', colPosBuild, '" column')
+  cat('Renaming "', colPosBuild, '" column in LD reference meta info as "pos"\n', sep='')
+  map_ldref$pos <- map_ldref[[colPosBuild]]
+  map_ldref[[colPosBuild]] <- NULL
+}
+
+
 cat('\n### Reading summary statistics', fileSumstats,'\n')
 sumstats <- bigreadr::fread2(fileSumstats)
 cat('Loaded', nrow(sumstats), 'SNPs\n')

diff --git a/version/version.py b/version/version.py
@@ -2,7 +2,7 @@
 _MINOR = "3"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "7"
+_PATCH = "8"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""