From 33891b066abba4154698245444a053ccd45f05bf Mon Sep 17 00:00:00 2001
From: Al-Murphy <alanmurph94@hotmail.com>
Date: Thu, 19 Sep 2024 13:33:29 +0100
Subject: [PATCH] infer effect column A0 & eff_on_minor_alleles param

---
 DESCRIPTION                                   |   2 +-
 NEWS.md                                       |  13 ++
 R/format_sumstats.R                           |  11 +-
 R/get_eff_frq_allele_combns.R                 |  22 ++-
 R/get_genome_build.R                          |   3 +-
 R/infer_effect_column.R                       | 148 ++++++++++++++----
 ...se_sumstats_column_headers_crossplatform.R |   8 +-
 R/validate_parameters.R                       |   4 +
 man/format_sumstats.Rd                        |   8 +
 man/get_genome_build.Rd                       |   3 +-
 man/import_sumstats.Rd                        |   6 +
 man/infer_effect_column.Rd                    |  35 +++--
 man/standardise_header.Rd                     |   5 +
 man/validate_parameters.Rd                    |   8 +
 14 files changed, 222 insertions(+), 54 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 37b72a6d..0101b700 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.13.6
+Version: 1.13.7
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",
diff --git a/NEWS.md b/NEWS.md
index fb1e93e6..caae33e0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,16 @@
+## CHANGES IN VERSION 1.13.7
+
+### Bug fix
+* `infer_eff_direction` now includes A0 as an ambiguous case as well as A1/A2.
+
+### New features
+* `eff_on_minor_alleles` parameter added (off by default) - controls whether 
+MungeSumstats should assume that the effects are majoritively measured on the 
+minor alleles. Default is FALSE as this is an assumption that won't be 
+appropriate in all cases. However, the benefit is that if we know the majority 
+of SNPs have their effects based on the minor alleles, we can catch cases where 
+the allele columns have been mislabelled. 
+
 ## CHANGES IN VERSION 1.13.6
 
 ### New features
diff --git a/R/format_sumstats.R b/R/format_sumstats.R
index 1310bb78..6f42374b 100644
--- a/R/format_sumstats.R
+++ b/R/format_sumstats.R
@@ -134,6 +134,12 @@
 #' the reference genome by SNP ID. Default is TRUE.
 #' @param infer_eff_direction Binary Should a check take place to ensure the 
 #' alleles match the effect direction? Default is TRUE.
+#' @param eff_on_minor_alleles Binary Should MungeSumstats assume that the 
+#' effects are majoritively measured on the minor alleles? Default is FALSE as
+#' this is an assumption that won't be appropriate in all cases. However, the 
+#' benefit is that if we know the majority of SNPs have their effects based on 
+#' the minor alleles, we can catch cases where the allele columns have been 
+#' mislabelled.
 #' @param strand_ambig_filter Binary Should SNPs with strand-ambiguous alleles
 #' be removed. Default is FALSE.
 #' @param allele_flip_check Binary Should the allele columns be checked against
@@ -267,6 +273,7 @@ format_sumstats <- function(path,
                             rmv_chr = c("X", "Y", "MT"),
                             on_ref_genome = TRUE,
                             infer_eff_direction = TRUE,
+                            eff_on_minor_alleles = FALSE,
                             strand_ambig_filter = FALSE,
                             allele_flip_check = TRUE,
                             allele_flip_drop = TRUE,
@@ -359,6 +366,7 @@ format_sumstats <- function(path,
             rmv_chr = rmv_chr,
             on_ref_genome = on_ref_genome,
             infer_eff_direction = infer_eff_direction,
+            eff_on_minor_alleles = eff_on_minor_alleles,
             strand_ambig_filter = strand_ambig_filter,
             allele_flip_check = allele_flip_check,
             allele_flip_drop = allele_flip_drop,
@@ -496,7 +504,8 @@ format_sumstats <- function(path,
             nThread = nThread,
             ref_genome = ref_genome,
             on_ref_genome = on_ref_genome,
-            infer_eff_direction = infer_eff_direction
+            infer_eff_direction = infer_eff_direction,
+            eff_on_minor_alleles = eff_on_minor_alleles
           )
         
         #### Check 3:Standardise headers for all OS ####
diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R
index e6e93751..689a59a0 100644
--- a/R/get_eff_frq_allele_combns.R
+++ b/R/get_eff_frq_allele_combns.R
@@ -19,7 +19,7 @@ get_eff_frq_allele_combns <-
       mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED
     #join with all allele cols
     allele_uncorrc <- 
-      mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED
+      mapping_file[mapping_file$CORRECTED %in% c('A1','A2','A*'),]$UNCORRECTED
     #get combinations
     eff_frq_allele_dt <- 
       data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, 
@@ -51,25 +51,35 @@ get_eff_frq_allele_combns <-
     eff_frq_allele_matches <- data.table::rbindlist(all_combns)
     #finally add some custom ones 
     custom_adds <- data.table::data.table("UNCORRECTED" = 
-                                            c("BETA1", "BETA2","AF1","AF2",
+                                            c("BETA1", "BETA2","BETA0",
+                                              "AF1","AF2","AF0",
                                               "FREQ.A1.1000G.EUR",
                                               "FREQ.A2.1000G.EUR",
+                                              "FREQ.A0.1000G.EUR",
                                               "FREQ.A1.ESP.EUR",
                                               "FREQ.A2.ESP.EUR",
+                                              "FREQ.A0.ESP.EUR",
                                               "FREQ.ALLELE1.HAPMAPCEU",
                                               "FREQ.ALLELE2.HAPMAPCEU",
-                                              "FREQ1","FREQ2", 
-                                              "FREQ1.HAPMAP","FREQ2.HAPMAP"),
+                                              "FREQ.ALLELE0.HAPMAPCEU",
+                                              "FREQ1","FREQ2","FREQ0", 
+                                              "FREQ1.HAPMAP","FREQ2.HAPMAP",
+                                              "FREQ0.HAPMAP"),
                                           "CORRECTED" = 
-                                            c("BETA", "BETA","FRQ","FRQ",
+                                            c("BETA", "BETA","BETA",
+                                              "FRQ","FRQ","FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
                                               "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ","FRQ","FRQ",
                                               "FRQ","FRQ",
-                                              "FRQ","FRQ"))
+                                              "FRQ"))
     eff_frq_allele_matches <- data.table::rbindlist(list(
       eff_frq_allele_matches,custom_adds))
     
diff --git a/R/get_genome_build.R b/R/get_genome_build.R
index e2b6a365..24dacf97 100644
--- a/R/get_genome_build.R
+++ b/R/get_genome_build.R
@@ -22,7 +22,8 @@
 #' This should help speed up cases where you have to read in \code{sumstats}
 #' from disk each time.
 #' @param allele_match_ref Instead of returning the genome_build this will 
-#' return the propotion of matches to each genome build for each allele (A1,A2).
+#' return the proportion of matches to each genome build for each allele 
+#' (A1,A2).
 #' @inheritParams format_sumstats
 #' @inheritParams get_genome_builds 
 #' 
diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R
index 2529769e..d97fedd5 100644
--- a/R/infer_effect_column.R
+++ b/R/infer_effect_column.R
@@ -1,20 +1,28 @@
 #' Infer if effect relates to a1 or A2 if ambiguously named
 #'
 #' Three checks are made to infer which allele the effect/frequency information
-#' relates to if they are ambiguous (named A1 and A2 or equivalent):
-#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or 
+#' relates to if they are ambiguous (named A0, A1 and A2 or equivalent):
+#' 1. Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or 
 #' equivalent). If not exit, otherwise continue to next checks. This can be 
 #' checked by using the mapping file and splitting A1/A2 mappings by those that 
-#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, 
+#' contain 0, 1 or 2 (ambiguous) or doesn't contain 0, 1 or 2 e.g. effect, 
 #' tested (unambiguous so fine for MSS to handle as is).
-#' 2. Look for effect column/frequency column where the A1/A2 explicitly 
-#' mentioned, if found then  we know the direction and should update A1/A2 
+#' 2. Look for effect column/frequency column where the A0/A1/A2 explicitly 
+#' mentioned, if found then  we know the direction and should update A0/A1/A2 
 #' naming so A2 is the effect column. We can look for such columns by getting 
-#' every combination of A1/A2 naming and effect/frq naming.
+#' every combination of A0/A1/A2 naming and effect/frq naming.
 #' 3. If not found in 2, a final check should be against the reference genome, 
-#' whichever of A1 and A2 has more of a match with the reference genome should 
-#' be taken as **not** the effect allele. There is an assumption in this but is 
-#' still better than guessing the ambiguous allele naming.
+#' whichever of A0, A1 and A2 has more of a match with the reference genome 
+#' should be taken as **not** the effect allele. There is an assumption in this 
+#' but is still better than guessing the ambiguous allele naming.
+#' 
+#' Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases. 
+#' However, This assumes that the effects are majoritively measured on the 
+#' minor alleles and should be used with caution as this is an assumption that 
+#' won't be appropriate in all cases. However, the benefit is that if we know 
+#' the majority of SNPs have their effects based on the minor alleles, we can 
+#' catch cases where the allele columns have been mislabelled. IF 
+#' eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped.
 #'
 #' @inheritParams format_sumstats 
 #' @inheritParams compute_nsize
@@ -36,8 +44,9 @@ infer_effect_column <-
            ref_genome = NULL,
            on_ref_genome = TRUE,
            infer_eff_direction = TRUE,
+           eff_on_minor_alleles = FALSE,
            return_list=TRUE) {
-    if(isTRUE(infer_eff_direction)){
+    if(isTRUE(infer_eff_direction)||isTRUE(eff_on_minor_alleles)){
       message("Infer Effect Column")
       message("First line of summary statistics file: ")
       msg <- paste0(names(sumstats_dt), split = "\t")
@@ -48,23 +57,29 @@ infer_effect_column <-
       # Identify allele mappings which are ambiguous and problematic
       # vs those that are interpretable
       colnames(mapping_file) <- toupper(colnames(mapping_file))
-      allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
+      #A* is A0, A* used since usually if A0/A1 used, meaning of A1 
+      #becomes eff allele and A0 is non-eff so need to flip later
+      allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2',
+                                                                   'A*'),]
       ambig_allele_map <- 
         allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
-                         grepl('2',allele_mapping$UNCORRECTED),]
+                         grepl('2',allele_mapping$UNCORRECTED)|
+                         grepl('0',allele_mapping$UNCORRECTED),]
       unambig_allele_map <- 
         allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
-                           grepl('2',allele_mapping$UNCORRECTED)),]
+                           grepl('2',allele_mapping$UNCORRECTED)|
+                           grepl('0',allele_mapping$UNCORRECTED)),]
       #as long as the sumstats contains 1 unambiguous allele column MSS will
       #work as expected
       unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
                                 toupper(column_headers))
       ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
                               toupper(column_headers))
-      #if both ambiguous and unambiguous columns found, rename ambiguous ones so
-      #they aren't used later by MSS
+      #if both ambiguous and unambiguous columns found, rename ambiguous ones
+      #so they aren't used later by MSS
       #example: 'A1','A2','EFFECT_ALLELE' all present
-      if (length(unambig_cols)>0 && length(ambig_cols)>0){
+      if (length(unambig_cols)>0 && length(ambig_cols)>0 && 
+          isFALSE(eff_on_minor_alleles)){
         #find if unambig and ambig relate to the same allele
         #get corrected name for unambig 
         unambig_corrcted <- 
@@ -92,27 +107,53 @@ infer_effect_column <-
                                  paste0(chng_i,"_INPUTTED"))
           }
         }
-      } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
-        #only continue if no unambiguous columns found but 2 ambig ones are found-
-        #less than 2 in total means allele info is missing which MSS can try fill
-        #in later
-        message("Allele columns are ambiguous, attempting to infer direction")
-        #get names for allele marked eff/frq columns
-        eff_frq_allele_matches <- get_eff_frq_allele_combns()
-        #now look for matches in sumstats
-        fnd_allele_indicator <- 
-          column_headers[toupper(column_headers) %in% 
-                           eff_frq_allele_matches$UNCORRECTED]
+      } else if ((length(unambig_cols)==0 && length(ambig_cols)>=2) || 
+                 isTRUE(eff_on_minor_alleles)){
+        #first case for ambig allelee where user didn't set eff_on_minor_alleles
+        if ((length(unambig_cols)==0 && length(ambig_cols)>=2) && 
+            isFALSE(eff_on_minor_alleles)){
+          #only continue if no unambiguous columns found but 2 ambig ones are 
+          #found- less than 2 in total means allele info is missing which MSS 
+          #can try fill in later
+          message("Allele columns are ambiguous, attempting to infer direction")
+          #get names for allele marked eff/frq columns
+          eff_frq_allele_matches <- get_eff_frq_allele_combns()
+          #now look for matches in sumstats
+          fnd_allele_indicator <- 
+            column_headers[toupper(column_headers) %in% 
+                             eff_frq_allele_matches$UNCORRECTED]
+        } else{
+          #for eff_on_minor_alleles = TRUE - 
+          #force length(fnd_allele_indicator)>0 to return FALSE
+          fnd_allele_indicator<-c()
+        }
         if(length(fnd_allele_indicator)>0){
           message("Found direction from effect/frq column naming")
           #fnd_allele_indicator could be >1 so majority vote
           a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
           a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
-          if(a2_mtch>=a1_mtch){
-            message("Effect/frq column(s) relate to A2 in the sumstats")
+          a0_mtch <- sum(grepl("A0",fnd_allele_indicator))
+          #need to also check if allele 0 & allele 1 found or more normal case
+          #of allele 1 & allele 2, as this flips which is interp as the eff 
+          #allele by MSS
+          samp_dt <- copy(sumstats_dt[1:10])
+          samp_dt <- 
+            standardise_sumstats_column_headers_crossplatform(samp_dt,
+                                                              mapping_file=
+                                                                mapping_file,
+                                                              convert_A0=FALSE,
+                                                              return_list=FALSE)
+          formatted_col_headers <- names(samp_dt)
+          #check if A0,A1 and A2 present
+          a1_found <- "A1" %in% formatted_col_headers
+          a2_found <- "A2" %in% formatted_col_headers
+          a0_found <- "A*" %in% formatted_col_headers
+          #if A2 found at all, it is eff col normally in MSS
+          if(a2_mtch>=a1_mtch && a2_found){
+            message("Effect/frq column(s) relate to A2 in the sumstat")
             #this is what MSS expects so no action required
-          }else{#a2_mtch<a1_mtch
-            message("Effect/frq column(s) relate to A1 in the sumstats")
+          }else if(a2_mtch<a1_mtch && a2_found){
+            message("Effect/frq column(s) relate to A1 in the sumstat")
             #this is the opposite to what MSS expects so switch A1/A2 naming
             #first get corrected names for allele columns then switch
             for (headerI in seq_len(nrow(mapping_file))) {
@@ -127,18 +168,43 @@ infer_effect_column <-
             data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
             data.table::setnames(sumstats_dt,"A1","A2")
             data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
+          }else if(a1_mtch>=a0_mtch&& a0_found){
+            message("Effect/frq column(s) relate to A1 where A0 in the sumstat")
+            #this is what MSS expects so no action required
+          }else if(a1_mtch<a0_mtch&& a0_found){
+            message("Effect/frq column(s) relate to A0 in the sumstat")
+            #this is the opposite to what MSS expects so switch A1/A0 naming
+            #first get corrected names for allele columns then switch
+            for (headerI in seq_len(nrow(mapping_file))) {
+              un <- mapping_file[headerI, "UNCORRECTED"]
+              cr <- mapping_file[headerI, "CORRECTED"]
+              #note ambig_cols here not all col headers
+              if (un %in% ambig_cols & (!cr %in% column_headers)) {
+                data.table::setnames(sumstats_dt, un, cr)
+              }
+            }
+            #now switch
+            data.table::setnames(sumstats_dt,"A*","A0_INPUTTED_OLD_")
+            #don't flip A1 as the next step of the mapping is to flip it anyway
+            data.table::setnames(sumstats_dt,"A0_INPUTTED_OLD_","A2")
+          }else{#unknown
+            print("ERROR: Unknown condition, not changing allele mapping.")
           }
           
         }
         else{
-          #didn't find any allele specific
+          #Either - didn't find any allele specific or set eff_on_minor_alleles
           #last option is to check the reference genome and take the allele with
           #more matches to it as the non-effect allele
           #need to standardise the sumstats for this
           #use get_genome_build function for this
+          if(isFALSE(on_ref_genome) && isTRUE(eff_on_minor_alleles)){
+            stop("on_ref_genome must equal TRUE to use eff_on_minor_alleles")
+          }
           if(isTRUE(on_ref_genome)){
+            #Note this check will also work for when A0 is present
             switch_req <- get_genome_build(
-              sumstats = sumstats_dt,
+              sumstats = copy(sumstats_dt),
               standardise_headers = TRUE,
               sampled_snps = sampled_snps,
               mapping_file = mapping_file,
@@ -163,6 +229,22 @@ infer_effect_column <-
                     data.table::setnames(sumstats_dt, un, cr)
                   }
                 }
+                # Special case!! A0/A1 -> ref/alt so A1 flips meaning,
+                # A0 is A* in mapping
+                # but usually A1/A2 -> ref/alt so if A* found,
+                # swap A1 to A2 and make A* -> A1
+                new_headers <- colnames(sumstats_dt)
+                if ("A*" %in% new_headers) {
+                  # if A1 and A2 also present need to rename A2
+                  if ("A1" %in% new_headers && "A2" %in% new_headers) {
+                    data.table::setnames(sumstats_dt, "A2", "A2_from_input")
+                  }
+                  # if A1 present change to A2, doesn't have to be, 
+                  # can be imputted
+                  data.table::setnames(sumstats_dt, "A1", "A2", 
+                                       skip_absent = TRUE)
+                  data.table::setnames(sumstats_dt, "A*", "A1")
+                }
                 #now switch
                 data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
                 data.table::setnames(sumstats_dt,"A1","A2")
diff --git a/R/standardise_sumstats_column_headers_crossplatform.R b/R/standardise_sumstats_column_headers_crossplatform.R
index 4ecc9893..e978cd6c 100644
--- a/R/standardise_sumstats_column_headers_crossplatform.R
+++ b/R/standardise_sumstats_column_headers_crossplatform.R
@@ -9,6 +9,9 @@
 #' @param uppercase_unmapped For columns that could not be identified in 
 #' the \code{mapping_file}, return them in the same format they were input as
 #'  (without forcing them to uppercase). 
+#' @param convert_A0 Whether to convert A* (representing A0) to A1/A2. This 
+#' should be done unless checking if A0 was present in the input as if you do 
+#' it you can't infer this. Default is TRUE   
 #' @inheritParams format_sumstats 
 #' @inheritParams compute_nsize
 #' @return list containing sumstats_dt, the modified summary statistics data
@@ -23,6 +26,7 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <-
     function(sumstats_dt,
              mapping_file = sumstatsColHeaders,
              uppercase_unmapped=TRUE,
+             convert_A0 = TRUE,
              return_list=TRUE) {
         message("Standardising column headers.")
         message("First line of summary statistics file: ")
@@ -56,12 +60,12 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <-
         # but usually A1/A2 -> ref/alt so if A* found,
         # swap A1 to A2 and make A* -> A1
         new_headers <- colnames(sumstats_dt)
-        if ("A*" %in% new_headers) {
+        if ("A*" %in% new_headers && isTRUE(convert_A0)) {
             # if A1 and A2 also present need to rename A2
             if ("A1" %in% new_headers && "A2" %in% new_headers) {
                 data.table::setnames(sumstats_dt, "A2", "A2_from_input")
             }
-            # if A1 present change to A2, doesn't have to be, can be inputted
+            # if A1 present change to A2, doesn't have to be, can be imputted
             data.table::setnames(sumstats_dt, "A1", "A2", skip_absent = TRUE)
             data.table::setnames(sumstats_dt, "A*", "A1")
         }
diff --git a/R/validate_parameters.R b/R/validate_parameters.R
index a81b52a6..2890e653 100644
--- a/R/validate_parameters.R
+++ b/R/validate_parameters.R
@@ -22,6 +22,7 @@ validate_parameters <- function(path,
                                 rmv_chr,
                                 on_ref_genome,
                                 infer_eff_direction,
+                                eff_on_minor_alleles,
                                 strand_ambig_filter,
                                 allele_flip_check,
                                 allele_flip_drop,
@@ -239,6 +240,9 @@ validate_parameters <- function(path,
     if(!is.logical(infer_eff_direction)){
       stop("infer_eff_direction must be either TRUE or FALSE")
     }
+    if(!is.logical(eff_on_minor_alleles)){
+      stop("eff_on_minor_alleles must be either TRUE or FALSE")
+    }
     if (!is.logical(strand_ambig_filter)) {
         stop("strand_ambig_filter must be either TRUE or FALSE")
     }
diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd
index d8c9cce1..24905339 100644
--- a/man/format_sumstats.Rd
+++ b/man/format_sumstats.Rd
@@ -32,6 +32,7 @@ format_sumstats(
   rmv_chr = c("X", "Y", "MT"),
   on_ref_genome = TRUE,
   infer_eff_direction = TRUE,
+  eff_on_minor_alleles = FALSE,
   strand_ambig_filter = FALSE,
   allele_flip_check = TRUE,
   allele_flip_drop = TRUE,
@@ -193,6 +194,13 @@ the reference genome by SNP ID. Default is TRUE.}
 \item{infer_eff_direction}{Binary Should a check take place to ensure the
 alleles match the effect direction? Default is TRUE.}
 
+\item{eff_on_minor_alleles}{Binary Should MungeSumstats assume that the
+effects are majoritively measured on the minor alleles? Default is FALSE as
+this is an assumption that won't be appropriate in all cases. However, the
+benefit is that if we know the majority of SNPs have their effects based on
+the minor alleles, we can catch cases where the allele columns have been
+mislabelled.}
+
 \item{strand_ambig_filter}{Binary Should SNPs with strand-ambiguous alleles
 be removed. Default is FALSE.}
 
diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd
index abd188ab..c3afd570 100644
--- a/man/get_genome_build.Rd
+++ b/man/get_genome_build.Rd
@@ -46,7 +46,8 @@ This should help speed up cases where you have to read in \code{sumstats}
 from disk each time.}
 
 \item{allele_match_ref}{Instead of returning the genome_build this will
-return the propotion of matches to each genome build for each allele (A1,A2).}
+return the proportion of matches to each genome build for each allele
+(A1,A2).}
 
 \item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or
 "GRCh38"). Argument is case-insensitive. Default is NULL which infers the
diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd
index 566adf95..88229dc5 100644
--- a/man/import_sumstats.Rd
+++ b/man/import_sumstats.Rd
@@ -158,6 +158,12 @@ which removes all non-autosomal SNPs.}
 the reference genome by SNP ID. Default is TRUE.}
     \item{\code{infer_eff_direction}}{Binary Should a check take place to ensure the
 alleles match the effect direction? Default is TRUE.}
+    \item{\code{eff_on_minor_alleles}}{Binary Should MungeSumstats assume that the
+effects are majoritively measured on the minor alleles? Default is FALSE as
+this is an assumption that won't be appropriate in all cases. However, the
+benefit is that if we know the majority of SNPs have their effects based on
+the minor alleles, we can catch cases where the allele columns have been
+mislabelled.}
     \item{\code{strand_ambig_filter}}{Binary Should SNPs with strand-ambiguous alleles
 be removed. Default is FALSE.}
     \item{\code{allele_flip_check}}{Binary Should the allele columns be checked against
diff --git a/man/infer_effect_column.Rd b/man/infer_effect_column.Rd
index 5cd1444e..9aa33c84 100644
--- a/man/infer_effect_column.Rd
+++ b/man/infer_effect_column.Rd
@@ -13,6 +13,7 @@ infer_effect_column(
   ref_genome = NULL,
   on_ref_genome = TRUE,
   infer_eff_direction = TRUE,
+  eff_on_minor_alleles = FALSE,
   return_list = TRUE
 )
 }
@@ -44,6 +45,13 @@ the reference genome by SNP ID. Default is TRUE.}
 \item{infer_eff_direction}{Binary Should a check take place to ensure the
 alleles match the effect direction? Default is TRUE.}
 
+\item{eff_on_minor_alleles}{Binary Should MungeSumstats assume that the
+effects are majoritively measured on the minor alleles? Default is FALSE as
+this is an assumption that won't be appropriate in all cases. However, the
+benefit is that if we know the majority of SNPs have their effects based on
+the minor alleles, we can catch cases where the allele columns have been
+mislabelled.}
+
 \item{return_list}{Return the \code{sumstats_dt} within a named list
 (default: \code{TRUE}).}
 }
@@ -53,22 +61,31 @@ table object
 }
 \description{
 Three checks are made to infer which allele the effect/frequency information
-relates to if they are ambiguous (named A1 and A2 or equivalent):
+relates to if they are ambiguous (named A0, A1 and A2 or equivalent):
 \enumerate{
-\item Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or
+\item Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or
 equivalent). If not exit, otherwise continue to next checks. This can be
 checked by using the mapping file and splitting A1/A2 mappings by those that
-contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect,
+contain 0, 1 or 2 (ambiguous) or doesn't contain 0, 1 or 2 e.g. effect,
 tested (unambiguous so fine for MSS to handle as is).
-\item Look for effect column/frequency column where the A1/A2 explicitly
-mentioned, if found then  we know the direction and should update A1/A2
+\item Look for effect column/frequency column where the A0/A1/A2 explicitly
+mentioned, if found then  we know the direction and should update A0/A1/A2
 naming so A2 is the effect column. We can look for such columns by getting
-every combination of A1/A2 naming and effect/frq naming.
+every combination of A0/A1/A2 naming and effect/frq naming.
 \item If not found in 2, a final check should be against the reference genome,
-whichever of A1 and A2 has more of a match with the reference genome should
-be taken as \strong{not} the effect allele. There is an assumption in this but is
-still better than guessing the ambiguous allele naming.
+whichever of A0, A1 and A2 has more of a match with the reference genome
+should be taken as \strong{not} the effect allele. There is an assumption in this
+but is still better than guessing the ambiguous allele naming.
+}
 }
+\details{
+Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases.
+However, This assumes that the effects are majoritively measured on the
+minor alleles and should be used with caution as this is an assumption that
+won't be appropriate in all cases. However, the benefit is that if we know
+the majority of SNPs have their effects based on the minor alleles, we can
+catch cases where the allele columns have been mislabelled. IF
+eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped.
 }
 \examples{
 sumstats <- MungeSumstats::formatted_example()
diff --git a/man/standardise_header.Rd b/man/standardise_header.Rd
index fd18b1a9..20b6100f 100644
--- a/man/standardise_header.Rd
+++ b/man/standardise_header.Rd
@@ -9,6 +9,7 @@ standardise_header(
   sumstats_dt,
   mapping_file = sumstatsColHeaders,
   uppercase_unmapped = TRUE,
+  convert_A0 = TRUE,
   return_list = TRUE
 )
 }
@@ -27,6 +28,10 @@ data(sumstatsColHeaders) for default mapping and necessary format.}
 the \code{mapping_file}, return them in the same format they were input as
 (without forcing them to uppercase).}
 
+\item{convert_A0}{Whether to convert A* (representing A0) to A1/A2. This
+should be done unless checking if A0 was present in the input as if you do
+it you can't infer this. Default is TRUE}
+
 \item{return_list}{Return the \code{sumstats_dt} within a named list
 (default: \code{TRUE}).}
 }
diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd
index af6bd25a..b57ab0e0 100644
--- a/man/validate_parameters.Rd
+++ b/man/validate_parameters.Rd
@@ -24,6 +24,7 @@ validate_parameters(
   rmv_chr,
   on_ref_genome,
   infer_eff_direction,
+  eff_on_minor_alleles,
   strand_ambig_filter,
   allele_flip_check,
   allele_flip_drop,
@@ -134,6 +135,13 @@ the reference genome by SNP ID. Default is TRUE.}
 \item{infer_eff_direction}{Binary Should a check take place to ensure the
 alleles match the effect direction? Default is TRUE.}
 
+\item{eff_on_minor_alleles}{Binary Should MungeSumstats assume that the
+effects are majoritively measured on the minor alleles? Default is FALSE as
+this is an assumption that won't be appropriate in all cases. However, the
+benefit is that if we know the majority of SNPs have their effects based on
+the minor alleles, we can catch cases where the allele columns have been
+mislabelled.}
+
 \item{strand_ambig_filter}{Binary Should SNPs with strand-ambiguous alleles
 be removed. Default is FALSE.}