Skip to content

Commit

Permalink
LDpred2/createBackingFile.R: process .bgen files (#200)
Browse files Browse the repository at this point in the history
* `LDpred2/createBackingFile.R`: support BGEN file format
Fixes #199

* test works

* bumped version.py to 1.3.7

* changed date
  • Loading branch information
espenhgn authored Oct 17, 2023
1 parent c7b0745 commit 92b8fcb
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 5 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ If MD5 sum is not listed for a certain release then it means that the container

* Miscellaneous goes here

## [1.3.7] - 2023-10-17

### Added

* Added support for reading BGEN (.bgen) files and converting them to bigSNPR backing files in ``scripts/pgs/LDpred2/createBackingFile.R``

## [1.3.6] - 2023-08-17

### Fixed
Expand Down
14 changes: 11 additions & 3 deletions scripts/pgs/LDpred2/createBackingFile.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
library(bigsnpr, quietly = T)
library(tools)
library(argparser, quietly=T)
par <- arg_parser('Create bigSNPR rds/bk (backing) files from plink bed files')
par <- add_argument(par, 'file-input', help='The bed input file')
# Command-line interface: two positional arguments (input file and output
# basename) plus the optional --file-snp-list, which the code visible in this
# script only reads when the input is a .bgen file.
par <- arg_parser('Create bigSNPR rds/bk (backing) files from PLINK (.bed) or BGEN (.bgen) files')
par <- add_argument(par, 'file-input', help='The input file (.bed/.bgen file ending)')
par <- add_argument(par, 'file-output', help='The output basename of files')
par <- add_argument(par, '--file-snp-list', help='Text file containing list of SNPs to extract from .bgen file (format <chr>_<pos>_<a1>_<a2>)', default = NULL)
# Parsed values are accessed below with underscores, e.g. parsed$file_input
parsed <- parse_args(par)
if (!file.exists(parsed$file_input)) stop(parsed$file_input, ' does not exist!')
# If the user passes a file.rds, the output will not be file.rds.rds
Expand All @@ -18,5 +19,12 @@ if (file.exists(outputFileRDS)) {
quit('no') # Exit without saving the workspace
}
cat('Processing', parsed$file_input, '\n')
res <- snp_readBed(parsed$file_input, backingfile = baseName)
# Choose the reader from the (case-insensitive) extension of the input file.
# tools::file_ext() is called with its namespace because the result is stored
# in a variable of the same name.
file_ext <- tolower(tools::file_ext(parsed$file_input))
if (file_ext == 'bed') {
  # PLINK input: read the .bed file into the backing file `baseName`
  res <- snp_readBed(parsed$file_input, backingfile = baseName)
} else if (file_ext == 'bgen') {
  # BGEN input: the variants to read come from --file-snp-list. The option is
  # optional on the command line, so fail early with a clear message instead
  # of letting scan() choke on a missing value.
  snp_list_file <- parsed$file_snp_list
  if (is.null(snp_list_file) || is.na(snp_list_file)) {
    stop('--file-snp-list is required when the input is a .bgen file', call. = FALSE)
  }
  if (!file.exists(snp_list_file)) stop(snp_list_file, ' does not exist!')
  # One whitespace-separated SNP id per entry; wrapped in a list with a single
  # element because a single BGEN file is passed.
  snp_ids <- scan(snp_list_file, what = character())
  res <- snp_readBGEN(parsed$file_input, backingfile = baseName, list_snp_id = list(snp_ids))
} else {
  stop('Unknown file extension: ', file_ext)
}
cat('Created', baseName, ' (.rds, .bk)\n')
12 changes: 11 additions & 1 deletion tests/test_LDpred2/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ export fileOutputSNPR=$DIR_DATA/public-data3.rds
export fileKeepSNPS=/REF/hapmap3/w_hm3.justrs
export fileOut=$DIR_TESTS/output/public-data.score

# BGEN files
# Paths under $DIR_DATA for the BGEN conversion test: the .bgen input, the
# .rds output name handed to createBackingFile.R, and the SNP list file.
export fileBGEN=$DIR_DATA/example.bgen
export fileBGENasRDS=$DIR_DATA/example.rds
export fileSNPlist=$DIR_DATA/example.snps

# copy .bgen files
# Copies the regenie example .bgen and its .bgi companion (presumably the
# bgenix index -- confirm) from $DIR_REFERENCE to the paths defined above.
cp $DIR_REFERENCE/examples/regenie/example.bgen $fileBGEN
cp $DIR_REFERENCE/examples/regenie/example.bgen.bgi $fileBGEN.bgi

### For imputation testing
# Copy some plink files
cp $DIR_REFERENCE/examples/prsice2/EUR.bed $DIR_DATA/
Expand All @@ -40,9 +49,10 @@ fileImpute=$DIR_DATA/EUR
# Imputed file
fileImputed=$DIR_DATA/EUR_imputed


# Create shortcut environment variables for running tools inside containers.
# Each variable expands to `singularity exec` with the same three bind mounts
# ($DIR_BASE at the same path, $DIR_REF_LDPRED at /ldpred2_ref, $DIR_REFERENCE
# at /REF), followed by a container image from $DIR_SIF and the program to run.
# They must be expanded unquoted so the shell splits them into words.
export RSCRIPT="singularity exec -B $DIR_BASE:$DIR_BASE -B $DIR_REF_LDPRED:/ldpred2_ref -B $DIR_REFERENCE:/REF $DIR_SIF/r.sif Rscript"
export BGENIX="singularity exec -B $DIR_BASE:$DIR_BASE -B $DIR_REF_LDPRED:/ldpred2_ref -B $DIR_REFERENCE:/REF $DIR_SIF/gwas.sif bgenix"
export PYTHON="singularity exec -B $DIR_BASE:$DIR_BASE -B $DIR_REF_LDPRED:/ldpred2_ref -B $DIR_REFERENCE:/REF $DIR_SIF/python3.sif python"

# The different modes to run (affects runs of scripts/extended.sh)
LDPRED_MODES="inf auto"
Expand Down
8 changes: 8 additions & 0 deletions tests/test_LDpred2/scripts/backingfile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,11 @@ if [ $? -eq 1 ]; then echo "$dump"; exit; fi

dump=$( $RSCRIPT $DIR_SCRIPTS/createBackingFile.R $fileInputGeno $fileOutputSNPR )
if [ $? -eq 1 ]; then echo "$dump"; exit; fi

# check that .bgen file can be converted
# First, list the variants in the .bgen file into $fileSNPlist.
# stdout is redirected to the file, so $dump stays empty here; report the
# failing step explicitly instead of echoing an empty string.
dump=$( $BGENIX -g "$fileBGEN" -incl-range 1:0- -list > "$fileSNPlist" )
if [ $? -ne 0 ]; then echo "bgenix failed to list variants in $fileBGEN"; exit; fi
# Rewrite $fileSNPlist in place as one <chr>_<pos>_<a1>_<a2> id per line
# (the format expected by --file-snp-list), dropping the first and last lines
# of the bgenix listing.
dump=$( $PYTHON -c "import os; import pandas as pd; df = pd.read_csv('$fileSNPlist', delim_whitespace=True, skipfooter=1, skiprows=[0], engine='python'); df = df[['chromosome', 'position', 'first_allele', 'alternative_alleles']]; df.to_csv('$fileSNPlist', index=False, sep='_', header=False)" )
if [ $? -ne 0 ]; then echo "failed to convert $fileSNPlist to SNP id format"; echo "$dump"; exit; fi
# Then, create backing file
dump=$( $RSCRIPT $DIR_SCRIPTS/createBackingFile.R "$fileBGEN" "$fileBGENasRDS" --file-snp-list "$fileSNPlist" )
if [ $? -ne 0 ]; then echo "$dump"; exit; fi
2 changes: 1 addition & 1 deletion version/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
_MINOR = "3"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "6"
_PATCH = "7"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down

0 comments on commit 92b8fcb

Please sign in to comment.