From 33891b066abba4154698245444a053ccd45f05bf Mon Sep 17 00:00:00 2001 From: Al-Murphy Date: Thu, 19 Sep 2024 13:33:29 +0100 Subject: [PATCH] infer effect column A0 & eff_on_minor_alleles param --- DESCRIPTION | 2 +- NEWS.md | 13 ++ R/format_sumstats.R | 11 +- R/get_eff_frq_allele_combns.R | 22 ++- R/get_genome_build.R | 3 +- R/infer_effect_column.R | 148 ++++++++++++++---- ...se_sumstats_column_headers_crossplatform.R | 8 +- R/validate_parameters.R | 4 + man/format_sumstats.Rd | 8 + man/get_genome_build.Rd | 3 +- man/import_sumstats.Rd | 6 + man/infer_effect_column.Rd | 35 +++-- man/standardise_header.Rd | 5 + man/validate_parameters.Rd | 8 + 14 files changed, 222 insertions(+), 54 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 37b72a6d..0101b700 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.13.6 +Version: 1.13.7 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NEWS.md b/NEWS.md index fb1e93e6..caae33e0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +## CHANGES IN VERSION 1.13.7 + +### Bug fix +* `infer_eff_direction` now includes A0 as an ambiguous case as well as A1/A2. + +### New features +* `eff_on_minor_alleles` parameter added (off by default) - controls whether +MungeSumstats should assume that the effects are majoritively measured on the +minor alleles. Default is FALSE as this is an assumption that won't be +appropriate in all cases. However, the benefit is that if we know the majority +of SNPs have their effects based on the minor alleles, we can catch cases where +the allele columns have been mislabelled. + ## CHANGES IN VERSION 1.13.6 ### New features diff --git a/R/format_sumstats.R b/R/format_sumstats.R index 1310bb78..6f42374b 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -134,6 +134,12 @@ #' the reference genome by SNP ID. Default is TRUE. 
#' @param infer_eff_direction Binary Should a check take place to ensure the #' alleles match the effect direction? Default is TRUE. +#' @param eff_on_minor_alleles Binary Should MungeSumstats assume that the +#' effects are majoritively measured on the minor alleles? Default is FALSE as +#' this is an assumption that won't be appropriate in all cases. However, the +#' benefit is that if we know the majority of SNPs have their effects based on +#' the minor alleles, we can catch cases where the allele columns have been +#' mislabelled. #' @param strand_ambig_filter Binary Should SNPs with strand-ambiguous alleles #' be removed. Default is FALSE. #' @param allele_flip_check Binary Should the allele columns be checked against @@ -267,6 +273,7 @@ format_sumstats <- function(path, rmv_chr = c("X", "Y", "MT"), on_ref_genome = TRUE, infer_eff_direction = TRUE, + eff_on_minor_alleles = FALSE, strand_ambig_filter = FALSE, allele_flip_check = TRUE, allele_flip_drop = TRUE, @@ -359,6 +366,7 @@ format_sumstats <- function(path, rmv_chr = rmv_chr, on_ref_genome = on_ref_genome, infer_eff_direction = infer_eff_direction, + eff_on_minor_alleles = eff_on_minor_alleles, strand_ambig_filter = strand_ambig_filter, allele_flip_check = allele_flip_check, allele_flip_drop = allele_flip_drop, @@ -496,7 +504,8 @@ format_sumstats <- function(path, nThread = nThread, ref_genome = ref_genome, on_ref_genome = on_ref_genome, - infer_eff_direction = infer_eff_direction + infer_eff_direction = infer_eff_direction, + eff_on_minor_alleles = eff_on_minor_alleles ) #### Check 3:Standardise headers for all OS #### diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R index e6e93751..689a59a0 100644 --- a/R/get_eff_frq_allele_combns.R +++ b/R/get_eff_frq_allele_combns.R @@ -19,7 +19,7 @@ get_eff_frq_allele_combns <- mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED #join with all allele cols allele_uncorrc <- - mapping_file[mapping_file$CORRECTED %in% 
c('A1','A2'),]$UNCORRECTED + mapping_file[mapping_file$CORRECTED %in% c('A1','A2','A*'),]$UNCORRECTED #get combinations eff_frq_allele_dt <- data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, @@ -51,25 +51,35 @@ get_eff_frq_allele_combns <- eff_frq_allele_matches <- data.table::rbindlist(all_combns) #finally add some custom ones custom_adds <- data.table::data.table("UNCORRECTED" = - c("BETA1", "BETA2","AF1","AF2", + c("BETA1", "BETA2","BETA0", + "AF1","AF2","AF0", "FREQ.A1.1000G.EUR", "FREQ.A2.1000G.EUR", + "FREQ.A0.1000G.EUR", "FREQ.A1.ESP.EUR", "FREQ.A2.ESP.EUR", + "FREQ.A0.ESP.EUR", "FREQ.ALLELE1.HAPMAPCEU", "FREQ.ALLELE2.HAPMAPCEU", - "FREQ1","FREQ2", - "FREQ1.HAPMAP","FREQ2.HAPMAP"), + "FREQ.ALLELE0.HAPMAPCEU", + "FREQ1","FREQ2","FREQ0", + "FREQ1.HAPMAP","FREQ2.HAPMAP", + "FREQ0.HAPMAP"), "CORRECTED" = - c("BETA", "BETA","FRQ","FRQ", + c("BETA", "BETA","BETA", + "FRQ","FRQ","FRQ", "FRQ", "FRQ", "FRQ", "FRQ", "FRQ", "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ","FRQ","FRQ", "FRQ","FRQ", - "FRQ","FRQ")) + "FRQ")) eff_frq_allele_matches <- data.table::rbindlist(list( eff_frq_allele_matches,custom_adds)) diff --git a/R/get_genome_build.R b/R/get_genome_build.R index e2b6a365..24dacf97 100644 --- a/R/get_genome_build.R +++ b/R/get_genome_build.R @@ -22,7 +22,8 @@ #' This should help speed up cases where you have to read in \code{sumstats} #' from disk each time. #' @param allele_match_ref Instead of returning the genome_build this will -#' return the propotion of matches to each genome build for each allele (A1,A2). +#' return the proportion of matches to each genome build for each allele +#' (A1,A2). 
#' @inheritParams format_sumstats #' @inheritParams get_genome_builds #' diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R index 2529769e..d97fedd5 100644 --- a/R/infer_effect_column.R +++ b/R/infer_effect_column.R @@ -1,20 +1,28 @@ #' Infer if effect relates to a1 or A2 if ambiguously named #' #' Three checks are made to infer which allele the effect/frequency information -#' relates to if they are ambiguous (named A1 and A2 or equivalent): -#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or +#' relates to if they are ambiguous (named A0, A1 and A2 or equivalent): +#' 1. Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or #' equivalent). If not exit, otherwise continue to next checks. This can be #' checked by using the mapping file and splitting A1/A2 mappings by those that -#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, +#' contain 0, 1 or 2 (ambiguous) or doesn't contain 0, 1 or 2 e.g. effect, #' tested (unambiguous so fine for MSS to handle as is). -#' 2. Look for effect column/frequency column where the A1/A2 explicitly -#' mentioned, if found then we know the direction and should update A1/A2 +#' 2. Look for effect column/frequency column where the A0/A1/A2 explicitly +#' mentioned, if found then we know the direction and should update A0/A1/A2 #' naming so A2 is the effect column. We can look for such columns by getting -#' every combination of A1/A2 naming and effect/frq naming. +#' every combination of A0/A1/A2 naming and effect/frq naming. #' 3. If not found in 2, a final check should be against the reference genome, -#' whichever of A1 and A2 has more of a match with the reference genome should -#' be taken as **not** the effect allele. There is an assumption in this but is -#' still better than guessing the ambiguous allele naming. +#' whichever of A0, A1 and A2 has more of a match with the reference genome +#' should be taken as **not** the effect allele. 
There is an assumption in this +#' but it is still better than guessing the ambiguous allele naming. +#' +#' Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases. +#' However, this assumes that the effects are majoritively measured on the +#' minor alleles and should be used with caution as this is an assumption that +#' won't be appropriate in all cases. However, the benefit is that if we know +#' the majority of SNPs have their effects based on the minor alleles, we can +#' catch cases where the allele columns have been mislabelled. If +#' eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped. #' #' @inheritParams format_sumstats #' @inheritParams compute_nsize @@ -36,8 +44,9 @@ infer_effect_column <- ref_genome = NULL, on_ref_genome = TRUE, infer_eff_direction = TRUE, + eff_on_minor_alleles = FALSE, return_list=TRUE) { - if(isTRUE(infer_eff_direction)){ + if(isTRUE(infer_eff_direction)||isTRUE(eff_on_minor_alleles)){ message("Infer Effect Column") message("First line of summary statistics file: ") msg <- paste0(names(sumstats_dt), split = "\t") @@ -48,23 +57,29 @@ infer_effect_column <- # Identify allele mappings which are ambiguous and problematic # vs those that are interpretable colnames(mapping_file) <- toupper(colnames(mapping_file)) - allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),] + #A* is A0, A* used since usually if A0/A1 used, meaning of A1 + #becomes eff allele and A0 is non-eff so need to flip later + allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2', + 'A*'),] ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)| - grepl('2',allele_mapping$UNCORRECTED),] + grepl('2',allele_mapping$UNCORRECTED)| + grepl('0',allele_mapping$UNCORRECTED),] unambig_allele_map <- allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)| - grepl('2',allele_mapping$UNCORRECTED)),] + grepl('2',allele_mapping$UNCORRECTED)| + grepl('0',allele_mapping$UNCORRECTED)),] #as long as the sumstats
contains 1 unambiguous allele column MSS will #work as expected unambig_cols <- intersect(unambig_allele_map$UNCORRECTED, toupper(column_headers)) ambig_cols <- intersect(ambig_allele_map$UNCORRECTED, toupper(column_headers)) - #if both ambiguous and unambiguous columns found, rename ambiguous ones so - #they aren't used later by MSS + #if both ambiguous and unambiguous columns found, rename ambiguous ones + #so they aren't used later by MSS #example: 'A1','A2','EFFECT_ALLELE' all present - if (length(unambig_cols)>0 && length(ambig_cols)>0){ + if (length(unambig_cols)>0 && length(ambig_cols)>0 && + isFALSE(eff_on_minor_alleles)){ #find if unambig and ambig relate to the same allele #get corrected name for unambig unambig_corrcted <- @@ -92,27 +107,53 @@ infer_effect_column <- paste0(chng_i,"_INPUTTED")) } } - } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){ - #only continue if no unambiguous columns found but 2 ambig ones are found- - #less than 2 in total means allele info is missing which MSS can try fill - #in later - message("Allele columns are ambiguous, attempting to infer direction") - #get names for allele marked eff/frq columns - eff_frq_allele_matches <- get_eff_frq_allele_combns() - #now look for matches in sumstats - fnd_allele_indicator <- - column_headers[toupper(column_headers) %in% - eff_frq_allele_matches$UNCORRECTED] + } else if ((length(unambig_cols)==0 && length(ambig_cols)>=2) || + isTRUE(eff_on_minor_alleles)){ + #first case for ambig allelee where user didn't set eff_on_minor_alleles + if ((length(unambig_cols)==0 && length(ambig_cols)>=2) && + isFALSE(eff_on_minor_alleles)){ + #only continue if no unambiguous columns found but 2 ambig ones are + #found- less than 2 in total means allele info is missing which MSS + #can try fill in later + message("Allele columns are ambiguous, attempting to infer direction") + #get names for allele marked eff/frq columns + eff_frq_allele_matches <- get_eff_frq_allele_combns() + #now look for 
matches in sumstats + fnd_allele_indicator <- + column_headers[toupper(column_headers) %in% + eff_frq_allele_matches$UNCORRECTED] + } else{ + #for eff_on_minor_alleles = TRUE - + #force length(fnd_allele_indicator)>0 to return FALSE + fnd_allele_indicator<-c() + } if(length(fnd_allele_indicator)>0){ message("Found direction from effect/frq column naming") #fnd_allele_indicator could be >1 so majority vote a1_mtch <- sum(grepl("A1",fnd_allele_indicator)) a2_mtch <- sum(grepl("A2",fnd_allele_indicator)) - if(a2_mtch>=a1_mtch){ - message("Effect/frq column(s) relate to A2 in the sumstats") + a0_mtch <- sum(grepl("A0",fnd_allele_indicator)) + #need to also check if allele 0 & allele 1 found or more normal case + #of allele 1 & allele 2, as this flips which is interp as the eff + #allele by MSS + samp_dt <- copy(sumstats_dt[1:10]) + samp_dt <- + standardise_sumstats_column_headers_crossplatform(samp_dt, + mapping_file= + mapping_file, + convert_A0=FALSE, + return_list=FALSE) + formatted_col_headers <- names(samp_dt) + #check if A0,A1 and A2 present + a1_found <- "A1" %in% formatted_col_headers + a2_found <- "A2" %in% formatted_col_headers + a0_found <- "A*" %in% formatted_col_headers + #if A2 found at all, it is eff col normally in MSS + if(a2_mtch>=a1_mtch && a2_found){ + message("Effect/frq column(s) relate to A2 in the sumstat") #this is what MSS expects so no action required - }else{#a2_mtch=a0_mtch&& a0_found){ + message("Effect/frq column(s) relate to A1 where A0 in the sumstat") + #this is what MSS expects so no action required + }else if(a1_mtch ref/alt so A1 flips meaning, + # A0 is A* in mapping + # but usually A1/A2 -> ref/alt so if A* found, + # swap A1 to A2 and make A* -> A1 + new_headers <- colnames(sumstats_dt) + if ("A*" %in% new_headers) { + # if A1 and A2 also present need to rename A2 + if ("A1" %in% new_headers && "A2" %in% new_headers) { + data.table::setnames(sumstats_dt, "A2", "A2_from_input") + } + # if A1 present change to A2, doesn't have to 
be, + # can be imputted + data.table::setnames(sumstats_dt, "A1", "A2", + skip_absent = TRUE) + data.table::setnames(sumstats_dt, "A*", "A1") + } #now switch data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_") data.table::setnames(sumstats_dt,"A1","A2") diff --git a/R/standardise_sumstats_column_headers_crossplatform.R b/R/standardise_sumstats_column_headers_crossplatform.R index 4ecc9893..e978cd6c 100644 --- a/R/standardise_sumstats_column_headers_crossplatform.R +++ b/R/standardise_sumstats_column_headers_crossplatform.R @@ -9,6 +9,9 @@ #' @param uppercase_unmapped For columns that could not be identified in #' the \code{mapping_file}, return them in the same format they were input as #' (without forcing them to uppercase). +#' @param convert_A0 Whether to convert A* (representing A0) to A1/A2. This +#' should be done unless checking if A0 was present in the input as if you do +#' it you can't infer this. Default is TRUE #' @inheritParams format_sumstats #' @inheritParams compute_nsize #' @return list containing sumstats_dt, the modified summary statistics data @@ -23,6 +26,7 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <- function(sumstats_dt, mapping_file = sumstatsColHeaders, uppercase_unmapped=TRUE, + convert_A0 = TRUE, return_list=TRUE) { message("Standardising column headers.") message("First line of summary statistics file: ") @@ -56,12 +60,12 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <- # but usually A1/A2 -> ref/alt so if A* found, # swap A1 to A2 and make A* -> A1 new_headers <- colnames(sumstats_dt) - if ("A*" %in% new_headers) { + if ("A*" %in% new_headers && isTRUE(convert_A0)) { # if A1 and A2 also present need to rename A2 if ("A1" %in% new_headers && "A2" %in% new_headers) { data.table::setnames(sumstats_dt, "A2", "A2_from_input") } - # if A1 present change to A2, doesn't have to be, can be inputted + # if A1 present change to A2, doesn't have to be, can be imputted 
data.table::setnames(sumstats_dt, "A1", "A2", skip_absent = TRUE) data.table::setnames(sumstats_dt, "A*", "A1") } diff --git a/R/validate_parameters.R b/R/validate_parameters.R index a81b52a6..2890e653 100644 --- a/R/validate_parameters.R +++ b/R/validate_parameters.R @@ -22,6 +22,7 @@ validate_parameters <- function(path, rmv_chr, on_ref_genome, infer_eff_direction, + eff_on_minor_alleles, strand_ambig_filter, allele_flip_check, allele_flip_drop, @@ -239,6 +240,9 @@ validate_parameters <- function(path, if(!is.logical(infer_eff_direction)){ stop("infer_eff_direction must be either TRUE or FALSE") } + if(!is.logical(eff_on_minor_alleles)){ + stop("eff_on_minor_alleles must be either TRUE or FALSE") + } if (!is.logical(strand_ambig_filter)) { stop("strand_ambig_filter must be either TRUE or FALSE") } diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd index d8c9cce1..24905339 100644 --- a/man/format_sumstats.Rd +++ b/man/format_sumstats.Rd @@ -32,6 +32,7 @@ format_sumstats( rmv_chr = c("X", "Y", "MT"), on_ref_genome = TRUE, infer_eff_direction = TRUE, + eff_on_minor_alleles = FALSE, strand_ambig_filter = FALSE, allele_flip_check = TRUE, allele_flip_drop = TRUE, @@ -193,6 +194,13 @@ the reference genome by SNP ID. Default is TRUE.} \item{infer_eff_direction}{Binary Should a check take place to ensure the alleles match the effect direction? Default is TRUE.} +\item{eff_on_minor_alleles}{Binary Should MungeSumstats assume that the +effects are majoritively measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.} + \item{strand_ambig_filter}{Binary Should SNPs with strand-ambiguous alleles be removed. 
Default is FALSE.} diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd index abd188ab..c3afd570 100644 --- a/man/get_genome_build.Rd +++ b/man/get_genome_build.Rd @@ -46,7 +46,8 @@ This should help speed up cases where you have to read in \code{sumstats} from disk each time.} \item{allele_match_ref}{Instead of returning the genome_build this will -return the propotion of matches to each genome build for each allele (A1,A2).} +return the proportion of matches to each genome build for each allele +(A1,A2).} \item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or "GRCh38"). Argument is case-insensitive. Default is NULL which infers the diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd index 566adf95..88229dc5 100644 --- a/man/import_sumstats.Rd +++ b/man/import_sumstats.Rd @@ -158,6 +158,12 @@ which removes all non-autosomal SNPs.} the reference genome by SNP ID. Default is TRUE.} \item{\code{infer_eff_direction}}{Binary Should a check take place to ensure the alleles match the effect direction? Default is TRUE.} + \item{\code{eff_on_minor_alleles}}{Binary Should MungeSumstats assume that the +effects are majoritively measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.} \item{\code{strand_ambig_filter}}{Binary Should SNPs with strand-ambiguous alleles be removed. 
Default is FALSE.} \item{\code{allele_flip_check}}{Binary Should the allele columns be checked against diff --git a/man/infer_effect_column.Rd b/man/infer_effect_column.Rd index 5cd1444e..9aa33c84 100644 --- a/man/infer_effect_column.Rd +++ b/man/infer_effect_column.Rd @@ -13,6 +13,7 @@ infer_effect_column( ref_genome = NULL, on_ref_genome = TRUE, infer_eff_direction = TRUE, + eff_on_minor_alleles = FALSE, return_list = TRUE ) } @@ -44,6 +45,13 @@ the reference genome by SNP ID. Default is TRUE.} \item{infer_eff_direction}{Binary Should a check take place to ensure the alleles match the effect direction? Default is TRUE.} +\item{eff_on_minor_alleles}{Binary Should MungeSumstats assume that the +effects are majoritively measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.} + \item{return_list}{Return the \code{sumstats_dt} within a named list (default: \code{TRUE}).} } @@ -53,22 +61,31 @@ table object } \description{ Three checks are made to infer which allele the effect/frequency information -relates to if they are ambiguous (named A1 and A2 or equivalent): +relates to if they are ambiguous (named A0, A1 and A2 or equivalent): \enumerate{ -\item Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or +\item Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or equivalent). If not exit, otherwise continue to next checks. This can be checked by using the mapping file and splitting A1/A2 mappings by those that -contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, +contain 0, 1 or 2 (ambiguous) or doesn't contain 0, 1 or 2 e.g. effect, tested (unambiguous so fine for MSS to handle as is). 
-\item Look for effect column/frequency column where the A1/A2 explicitly -mentioned, if found then we know the direction and should update A1/A2 +\item Look for effect column/frequency column where the A0/A1/A2 explicitly +mentioned, if found then we know the direction and should update A0/A1/A2 naming so A2 is the effect column. We can look for such columns by getting -every combination of A1/A2 naming and effect/frq naming. +every combination of A0/A1/A2 naming and effect/frq naming. \item If not found in 2, a final check should be against the reference genome, -whichever of A1 and A2 has more of a match with the reference genome should -be taken as \strong{not} the effect allele. There is an assumption in this but is -still better than guessing the ambiguous allele naming. +whichever of A0, A1 and A2 has more of a match with the reference genome +should be taken as \strong{not} the effect allele. There is an assumption in this +but it is still better than guessing the ambiguous allele naming. +} } +\details{ +Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases. +However, this assumes that the effects are majoritively measured on the +minor alleles and should be used with caution as this is an assumption that +won't be appropriate in all cases. However, the benefit is that if we know +the majority of SNPs have their effects based on the minor alleles, we can +catch cases where the allele columns have been mislabelled. If +eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped.
} \examples{ sumstats <- MungeSumstats::formatted_example() diff --git a/man/standardise_header.Rd b/man/standardise_header.Rd index fd18b1a9..20b6100f 100644 --- a/man/standardise_header.Rd +++ b/man/standardise_header.Rd @@ -9,6 +9,7 @@ standardise_header( sumstats_dt, mapping_file = sumstatsColHeaders, uppercase_unmapped = TRUE, + convert_A0 = TRUE, return_list = TRUE ) } @@ -27,6 +28,10 @@ data(sumstatsColHeaders) for default mapping and necessary format.} the \code{mapping_file}, return them in the same format they were input as (without forcing them to uppercase).} +\item{convert_A0}{Whether to convert A* (representing A0) to A1/A2. This +should be done unless checking if A0 was present in the input as if you do +it you can't infer this. Default is TRUE} + \item{return_list}{Return the \code{sumstats_dt} within a named list (default: \code{TRUE}).} } diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd index af6bd25a..b57ab0e0 100644 --- a/man/validate_parameters.Rd +++ b/man/validate_parameters.Rd @@ -24,6 +24,7 @@ validate_parameters( rmv_chr, on_ref_genome, infer_eff_direction, + eff_on_minor_alleles, strand_ambig_filter, allele_flip_check, allele_flip_drop, @@ -134,6 +135,13 @@ the reference genome by SNP ID. Default is TRUE.} \item{infer_eff_direction}{Binary Should a check take place to ensure the alleles match the effect direction? Default is TRUE.} +\item{eff_on_minor_alleles}{Binary Should MungeSumstats assume that the +effects are majoritively measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.} + \item{strand_ambig_filter}{Binary Should SNPs with strand-ambiguous alleles be removed. Default is FALSE.}