From b88a70dcf44378c267972c238bf63f1c9012e18e Mon Sep 17 00:00:00 2001 From: rmgpanw Date: Mon, 22 Apr 2024 22:47:32 +0100 Subject: [PATCH] add `drop_na_cols` parameter to `format_sumstats()` and `check_miss_data()` --- R/check_miss_data.R | 14 +++++++++++-- R/format_sumstats.R | 10 +++++++++- R/validate_parameters.R | 10 ++++++++++ tests/testthat/test-missing_data.R | 32 ++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 3 deletions(-) diff --git a/R/check_miss_data.R b/R/check_miss_data.R index 30eefaf..b24bf61 100644 --- a/R/check_miss_data.R +++ b/R/check_miss_data.R @@ -7,7 +7,8 @@ #' @keywords internal #' @importFrom stats complete.cases check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out, - tabix_index, nThread, log_files) { + tabix_index, nThread, log_files, + drop_na_cols) { message("Checking for missing data.") col_headers <- names(sumstats_dt) # use data table for speed @@ -20,7 +21,16 @@ check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out, col_headers[grepl("^convert_", col_headers)], "SNP_INFO"["SNP_INFO"%in% col_headers] ) - incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols] + + if (!is.null(drop_na_cols)) { + drop_na_cols_in_sumstats <- + c(drop_na_cols)[drop_na_cols %in% names(sumstats_dt)] + incl_cols <- + c(drop_na_cols_in_sumstats)[!drop_na_cols_in_sumstats %in% ignore_cols] + } else { + incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols] + } + if (nrow(sumstats_dt[!complete.cases(sumstats_dt[, incl_cols, with = FALSE ]), ]) > 0) { diff --git a/R/format_sumstats.R b/R/format_sumstats.R index 69422a4..212e11f 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -221,6 +221,11 @@ #' give is incorrect you can supply your own mapping file. Must be a 2 column #' dataframe with column names "Uncorrected" and "Corrected". See #' data(sumstatsColHeaders) for default mapping and necessary format. 
+#' @param drop_na_cols A character vector of column names to be checked for missing values. +#' Rows with missing values in any of these columns (if present in the dataset) will be dropped. If `NULL`, +#' all columns will be checked for missing values. Default columns are SNP, +#' chromosome, position, allele 1, allele 2, frequency, beta, standard error, p +#' value and N columns. #' #' @importFrom data.table fread #' @importFrom data.table fwrite @@ -284,6 +289,7 @@ format_sumstats <- function(path, imputation_ind = FALSE, force_new = FALSE, mapping_file = sumstatsColHeaders, + drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"), #deprecated parameters rmv_chrPrefix = NULL ) { @@ -367,6 +373,7 @@ format_sumstats <- function(path, mapping_file = mapping_file, tabix_index = tabix_index, chain_source = chain_source, + drop_na_cols = drop_na_cols, #deprecated parameters rmv_chrPrefix = rmv_chrPrefix ) @@ -773,7 +780,8 @@ format_sumstats <- function(path, check_save_out = check_save_out, tabix_index = tabix_index, nThread = nThread, - log_files = log_files + log_files = log_files, + drop_na_cols = drop_na_cols ) # update values log_files <- sumstats_return$log_files diff --git a/R/validate_parameters.R b/R/validate_parameters.R index 2db30fb..b1637d5 100644 --- a/R/validate_parameters.R +++ b/R/validate_parameters.R @@ -46,6 +46,7 @@ validate_parameters <- function(path, mapping_file, tabix_index, chain_source, + drop_na_cols, #deprecated parameters rmv_chrPrefix) { # Checking if the file exists should happen first - @@ -406,6 +407,15 @@ validate_parameters <- function(path, stop(tbx_msg) } + # validate drop_na_cols + if (!is.character(drop_na_cols)) { + if (!is.null(drop_na_cols)) { + stop( + "Parameter `drop_na_cols` should be either a character vector of column names, or `NULL`" + ) + } + } + #deprecated parameters if (!is.null(rmv_chrPrefix)) { dep_msg <- paste0( diff --git a/tests/testthat/test-missing_data.R 
b/tests/testthat/test-missing_data.R index 1701331..771c159 100644 --- a/tests/testthat/test-missing_data.R +++ b/tests/testthat/test-missing_data.R @@ -60,6 +60,38 @@ test_that("Handle missing data", { ) reformatted_lines <- readLines(reformatted) expect_equal(reformatted_lines, org_lines) + + # set `drop_na_cols` to `NULL` + miss_extra_col <- miss + miss_extra_col$extra <- NA + + expect_error(MungeSumstats::format_sumstats( + miss_extra_col, + ref_genome = "GRCh37", + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + sort_coordinates = FALSE, + dbSNP = 144, + drop_na_cols = NULL + ), + regexp = "All SNPs have been filtered out of your summary statistics dataset") + + reformatted_extra_col <- MungeSumstats::format_sumstats( + miss_extra_col, + ref_genome = "GRCh37", + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + sort_coordinates = FALSE, + dbSNP = 144, + drop_na_cols = c("CHR", "POS") + ) + + reformatted_extra_col_lines <- readLines(reformatted_extra_col) + expect_equal(length(reformatted_extra_col_lines), length(org_lines)) } else{ expect_equal(is_32bit_windows, TRUE)