R/quantile_flag.R

#' @title Test and flag outliers using quantiles
#'
#' @description Test and flag outliers using quantiles to set upper and lower
#' limits for values of a trait for every species in a dataframe.
#'
#' @details
#' The quantile_flag function returns a dataframe with outliers flagged.
#' It uses quantiles to set the upper and lower limit of values of a trait to
#' test for outliers. For each species, the limits are computed from the
#' records of the requested trait (and life stage, if given) whose
#' "measurementStatus" is not listed in \code{status} and whose
#' "measurementValue" is not missing.
#'
#' Species with fewer usable records than \code{sample.min} are labelled
#' 'too few records'. Every remaining record of the species and trait whose
#' status is not listed in \code{status} is labelled 'possible juvenile'
#' (value below the lower limit), 'outlier' (value above the upper limit), or
#' 'possible adult, possibly good' (anything else). The life stage filter is
#' applied only when computing the limits, not when labelling records.
#'
#' The function is useful for non-normally and non-log normally distributed
#' data. The function is called with parameters that correspond to values
#' contained in the data itself which act as a filter on the returned record
#' set. The function goes through every species in the dataset for the
#' specified trait.
#'
#' @param data (data.frame) a dataframe, assumed to be in FuTRES format. It
#' must contain the columns "scientificName", "measurementType" and
#' "measurementValue" (and "lifeStage" when \code{stage} is given).
#' @param trait (string) a measurementType of interest, a list of traits can
#' be found using futres_traits()
#' @param taxa (string) a species (scientificName) within the dataset.
#' By default, the function goes through every unique species in the dataset.
#' @param stage (string) the life stage of the group (e.g., adult).
#' @param status (string) a vector of measurementStatus values to ignore when
#' making upper and lower limits.
#' The default is to ignore "too few records" and "outlier".
#' @param quant (numeric) the percentage specifying the quantiles used for the
#' lower and upper limits; must be greater than 0 and at most 50.
#' By default, the function uses 5, i.e. the 0.05 and 0.95 quantiles.
#' @param sample.min (integer) a limit for the number of samples required to
#' test for outliers. Defaults to a minimum of 3 individuals.
#' @export
#' @keywords outlier quantile
#'
#' @return The input dataframe with the columns "sample.size",
#' "measurementStatus", "index" (the row names of the input), "upperLimit",
#' "lowerLimit" and "limitMethod" added or updated.
#'
#' @examples
#' wildcat.store <- futres_data(scientificName = "Puma concolor")
#' wildcat  <- wildcat.store$data
#'
#' wildcat.quant <- quantile_flag(data = wildcat, trait = "body mass")
#'
#' print(wildcat.quant)
quantile_flag <- function(
    data = NULL,
    trait = NULL,
    taxa = NULL,
    stage = NULL,
    status = NULL,
    quant = NULL,
    sample.min = NULL
)
{
  # ---- argument validation ----
  if (is.null(data)) {
    stop("The argument 'data' is missing, please enter a dataframe")
  }

  if (!is.data.frame(data)) {
    stop("Data is not a dataframe")
  }

  if (is.null(trait)) {
    stop("The argument 'trait' is missing, please enter a trait value")
  }

  required.cols <- c("scientificName", "measurementType", "measurementValue")
  if (!is.null(stage)) {
    required.cols <- c(required.cols, "lifeStage")
  }
  absent.cols <- setdiff(required.cols, colnames(data))
  if (length(absent.cols) > 0) {
    stop("Data is missing required column(s): ",
         paste(absent.cols, collapse = ", "))
  }

  # ---- defaults ----
  n.limit <- if (is.null(sample.min)) 3 else sample.min

  if (is.null(quant)) {
    quant <- 5
  }
  if (!is.numeric(quant) || length(quant) != 1 || is.na(quant) ||
      quant <= 0 || quant > 50) {
    stop("The argument 'quant' must be a single percentage > 0 and <= 50")
  }
  # quant is a percentage; steps is the matching probability
  steps <- quant * 0.01
  # ask for the two tail quantiles directly so the limits are right even when
  # quant does not divide 100 evenly
  probs <- c(steps, 1 - steps)

  if (is.null(status)) {
    status <- c("too few records", "outlier")
  }

  # ---- helpers ----
  # add a column only when it is genuinely absent, so that flags and limits
  # from an earlier pass are kept (and can be honoured through 'status')
  add_if_absent <- function(df, col, fill) {
    if (!(col %in% colnames(df))) {
      df[[col]] <- rep(fill, nrow(df))
    }
    df
  }
  # factors must go through character, otherwise as.numeric() yields codes
  to_numeric <- function(x) {
    if (is.factor(x)) {
      x <- as.character(x)
    }
    as.numeric(x)
  }
  to_character <- function(x) {
    if (is.factor(x)) as.character(x) else x
  }

  # ---- bookkeeping columns (same order as they were originally appended) ----
  data <- add_if_absent(data, "sample.size", NA_real_)
  data <- add_if_absent(data, "measurementStatus", "")
  data[["measurementValue"]] <- to_numeric(data[["measurementValue"]])
  data[["index"]] <- rownames(data)
  data <- add_if_absent(data, "upperLimit", NA_real_)
  data <- add_if_absent(data, "lowerLimit", NA_real_)
  data <- add_if_absent(data, "limitMethod", "")

  data[["sample.size"]] <- to_numeric(data[["sample.size"]])
  data[["upperLimit"]] <- to_numeric(data[["upperLimit"]])
  data[["lowerLimit"]] <- to_numeric(data[["lowerLimit"]])
  data[["measurementStatus"]] <- to_character(data[["measurementStatus"]])
  data[["limitMethod"]] <- to_character(data[["limitMethod"]])

  # ---- species to process: all species in the data unless 'taxa' is given ----
  sp <- if (is.null(taxa)) unique(data[["scientificName"]]) else taxa
  sp <- as.character(sp)
  sp <- sp[!is.na(sp)]

  sci.name <- data[["scientificName"]]
  is.trait <- !is.na(data[["measurementType"]]) &
    data[["measurementType"]] == trait

  # records allowed to contribute to the limits: right trait, status not
  # ignored, value present, and (optionally) the requested life stage
  usable <- is.trait &
    !(data[["measurementStatus"]] %in% status) &
    !is.na(data[["measurementValue"]])
  if (!is.null(stage)) {
    usable <- usable &
      !is.na(data[["lifeStage"]]) &
      data[["lifeStage"]] == stage
  }

  # ---- pass 1: per-species sample size and quantile limits ----
  for (taxon in sp) {
    is.taxon <- !is.na(sci.name) & sci.name == taxon
    rows <- which(is.taxon & is.trait)
    values <- data[["measurementValue"]][which(is.taxon & usable)]

    # quantile() returns NA for both limits when there are no usable values
    limits <- stats::quantile(values, probs = probs, na.rm = TRUE,
                              names = FALSE)

    data[["sample.size"]][rows] <- length(values)
    data[["lowerLimit"]][rows] <- limits[1]
    data[["upperLimit"]][rows] <- limits[2]
    data[["limitMethod"]][rows] <- "quantile" # label method
  }

  # ---- pass 2: flag under-sampled species ----
  in.scope <- is.trait & (sci.name %in% sp)
  too.few <- which(in.scope & data[["sample.size"]] < n.limit)
  data[["measurementStatus"]][too.few] <- "too few records"

  # ---- pass 3: label every remaining record against its species' limits ----
  candidates <- which(in.scope & !(data[["measurementStatus"]] %in% status))
  values <- data[["measurementValue"]][candidates]

  # records whose value or limits are NA fail both comparisons and therefore
  # keep the default label, as in the row-by-row formulation
  labels <- rep("possible adult, possibly good", length(candidates))
  labels[which(values > data[["upperLimit"]][candidates])] <- "outlier"
  labels[which(values < data[["lowerLimit"]][candidates])] <-
    "possible juvenile"
  data[["measurementStatus"]][candidates] <- labels

  data
}